diff mbox series

[RFC,3/4] spapr: Add NVDIMM device support

Message ID 154943078167.27958.5009288263168039462.stgit@lep8c.aus.stglabs.ibm.com
State New
Headers show
Series ppc: spapr: virtual NVDIMM support | expand

Commit Message

Shivaprasad G Bhat Feb. 6, 2019, 5:26 a.m. UTC
Add support for NVDIMM devices for sPAPR. Piggyback on the existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power (this may
need to be revisited later).  Create the required DT entries for the
device (some entries have dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. Guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

This is how it can be used:
Add nvdimm=on to the QEMU machine arguments.
Ex: -machine pseries,nvdimm=on
For coldplug, add the device on the QEMU command line as shown below:
-object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
-device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, add the device from the monitor as shown below:
object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
               [Early implementation]
---
 default-configs/ppc64-softmmu.mak |    1 
 hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
 hw/ppc/spapr_drc.c                |   17 +++
 hw/ppc/spapr_events.c             |    4 +
 include/hw/ppc/spapr.h            |   10 ++
 include/hw/ppc/spapr_drc.h        |    9 ++
 6 files changed, 241 insertions(+), 12 deletions(-)

Comments

David Gibson Feb. 12, 2019, 1:49 a.m. UTC | #1
On Tue, Feb 05, 2019 at 11:26:27PM -0600, Shivaprasad G Bhat wrote:
> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
> device interface in QEMU to support virtual NVDIMM devices for Power (May have
> to re-look at this later).  Create the required DT entries for the
> device (some entries have dummy values right now).
> 
> The patch creates the required DT node and sends a hotplug
> interrupt to the guest. Guest is expected to undertake the normal
> DR resource add path in response and start issuing PAPR SCM hcalls.
> 
> This is how it can be used ..
> Add nvdimm=on to the qemu machine argument.
> Ex : -machine pseries,nvdimm=on
> For coldplug, the device to be added in qemu command line as shown below
> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> 
> For hotplug, the device to be added from monitor as below
> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>                [Early implementation]
> ---
>  default-configs/ppc64-softmmu.mak |    1 
>  hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>  hw/ppc/spapr_drc.c                |   17 +++
>  hw/ppc/spapr_events.c             |    4 +
>  include/hw/ppc/spapr.h            |   10 ++
>  include/hw/ppc/spapr_drc.h        |    9 ++
>  6 files changed, 241 insertions(+), 12 deletions(-)
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 7f34ad0528..b6e1aa5125 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>  CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>  CONFIG_MEM_DEVICE=y
>  CONFIG_DIMM=y
> +CONFIG_NVDIMM=y
>  CONFIG_SPAPR_RNG=y
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 0fcdd35cbe..7e7a1a8041 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -73,6 +73,7 @@
>  #include "qemu/cutils.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/mem/nvdimm.h"
>  
>  #include <libfdt.h>
>  
> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>      uint8_t *int_buf, *cur_index, buf_len;
>      int ret;
>      uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>      uint64_t addr, cur_addr, size;
>      uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>      uint64_t mem_end = machine->device_memory->base +
> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>              nr_entries++;
>          }
>  
> -        /* Entry for DIMM */
> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> -        g_assert(drc);
> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
> -                                     spapr_drc_index(drc), node,
> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
> +            /* Entry for NVDIMM */
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
> +            g_assert(drc);
> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
> +                                         spapr_drc_index(drc), -1, 0);
> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
> +        } else {
> +            /* Entry for DIMM */
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> +            g_assert(drc);
> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
> +                                         spapr_drc_index(drc), node,
> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
> +            cur_addr = addr + size;
> +        }
>          QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>          nr_entries++;
> -        cur_addr = addr + size;
>      }
>  
>      /* Entry for remaining hotpluggable area */
> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>      }
>  }
>  
> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
> +                                      uint32_t node, uint64_t addr,
> +                                      uint64_t size, uint64_t label_size);

Re-ordering the code is generally preferred to static forward declarations.

> +static void spapr_create_nvdimm(void *fdt)

I'm trying to standardize on spapr_dt_*() for functions which generate
bits of the device tree.

> +{
> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
> +    GSList *dimms = NULL;
> +
> +    if (offset < 0) {
> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
> +        _FDT(offset);
> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));

No need to explicitly set the "name" property, that's implicit in the
node name.

> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
> +                                 "ibm,persistent-memory")));
> +    }
> +
> +    /*NB : Add drc-info array here */
> +
> +    /* Create DT entries for cold plugged NVDIMM devices */
> +    dimms = nvdimm_get_device_list();
> +    for (; dimms; dimms = dimms->next) {
> +        NVDIMMDevice *nvdimm = dimms->data;
> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
> +        uint64_t lsize = nvdimm->label_size;
> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> +                                           NULL);
> +
> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
> +                                   size, lsize);

It might be cleaner to just pass the NVDIMMDevice * rather than
umpteen parameters.

> +    }
> +    g_slist_free(dimms);
> +    return;
> +}
> +
>  static void *spapr_build_fdt(sPAPRMachineState *spapr)
>  {
>      MachineState *machine = MACHINE(spapr);
> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>          exit(1);
>      }
>  
> +    /* NVDIMM devices */
> +    if (spapr->nvdimm_enabled) {
> +        spapr_create_nvdimm(fdt);
> +    }
> +
>      return fdt;
>  }
>  
> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>      }
>  }
>  
> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->nvdimm_enabled;
> +}
> +
> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    spapr->nvdimm_enabled = value;
> +}
> +
>  static void spapr_instance_init(Object *obj)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>      object_property_set_description(obj, "ic-mode",
>                   "Specifies the interrupt controller mode (xics, xive, dual)",
>                   NULL);
> +    object_property_add_bool(obj, "nvdimm",
> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
> +    object_property_set_description(obj, "nvdimm",
> +                                    "Enable support for nvdimm devices",
> +                                    NULL);

I'm not seeing a lot of point to this machine parameter.

>  }
>  
>  static void spapr_machine_finalizefn(Object *obj)
> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>      }
>  }
>  
> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
> +                                      uint64_t addr, uint64_t size,
> +                                      uint64_t label_size)
> +{
> +    int offset;
> +    char buf[40];
> +    GString *lcode = g_string_sized_new(10);
> +    sPAPRDRConnector *drc;
> +    QemuUUID uuid;
> +    uint32_t drc_idx;
> +    uint32_t associativity[] = {
> +        cpu_to_be32(0x4), /* length */
> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
> +        cpu_to_be32(0x0), cpu_to_be32(node)
> +    };
> +
> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    g_assert(drc);
> +
> +    drc_idx = spapr_drc_index(drc);
> +
> +    sprintf(buf, "pmem@%x", drc_idx);
> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);

"fdt_offset" vs. "offset" isn't very obvious.  Maybe parent_offset /
child_offset or something?

> +    _FDT(offset);
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));

Again, no need to set "name".

> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
> +
> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
> +    g_string_free(lcode, TRUE);

I think leaving this property out would be preferable to including it
but putting nothing useful there.

> +
> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> +                      sizeof(associativity))));
> +    g_random_set_seed(drc_idx);
> +    qemu_uuid_generate(&uuid);

This looks bogus.  I'm guessing the set seed is so that you generate
consistent UUIDs for the same NVDIMM in a guest.  First, that's making
a lot of assumptions about how qemu_uuid_generate() works that aren't
really warranted.  Second, it poisons the RNG for anything running
after this which actually wants (pseudo) random numbers.

I think you need to make the UUID a property of the device instead.

> +
> +    qemu_uuid_unparse(&uuid, buf);
> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> +
> +    /*NB : What it should be? */
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> +
> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> +
> +    return offset;
> +}
> +
> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> +                             uint64_t size, uint32_t node,
> +                             Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> +    sPAPRDRConnector *drc;
> +    bool hotplugged = spapr_drc_hotplugged(dev);
> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> +    void *fdt;
> +    int fdt_offset, fdt_size;
> +    Error *local_err = NULL;
> +
> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    g_assert(drc);

Creating the DRC in the hotplug path looks bogus.  Generally the DRC
has to exist before you can even attempt to plug the device.

> +    fdt = create_device_tree(&fdt_size);
> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
> +                                            size, nvdimm->label_size);
> +
> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    if (hotplugged) {
> +        spapr_hotplug_req_add_by_index(drc);
> +    }
> +}
> +
>  static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>                                Error **errp)
>  {
>      Error *local_err = NULL;
>      sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>      PCDIMMDevice *dimm = PC_DIMM(dev);
> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>      uint64_t size, addr;
>      uint32_t node;
>  
> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  
>      node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>                                      &error_abort);
> -    spapr_add_lmbs(dev, addr, size, node,
> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> -                   &local_err);
> +    if (!is_nvdimm) {
> +        spapr_add_lmbs(dev, addr, size, node,
> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> +                       &local_err);
> +    } else {
> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
> +    }
> +
>      if (local_err) {
>          goto out_unplug;
>      }
> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  {
>      const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>      sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>      PCDIMMDevice *dimm = PC_DIMM(dev);
>      Error *local_err = NULL;
>      uint64_t size;
> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>          return;
>      }
>  
> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>          error_setg(errp, "Hotplugged memory size must be a multiple of "
> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>          return;
> +    } else if (is_nvdimm) {
> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> +            return;
> +        }
> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
> +            error_setg(errp, "NVDIMM size must be atleast "
> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> +            return;
> +        }
> +
> +        /* Align to scm block size, exclude the label */
> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
>      }
>  
>      memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 2edb7d1e9c..94ddd102cc 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>      drck->release = spapr_lmb_release;
>  }
>  
> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
> +
> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
> +    drck->typename = "MEM";
> +    drck->drc_name_prefix = "PMEM ";
> +    drck->release = NULL;
> +}
> +
>  static const TypeInfo spapr_dr_connector_info = {
>      .name          = TYPE_SPAPR_DR_CONNECTOR,
>      .parent        = TYPE_DEVICE,
> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>      .class_init    = spapr_drc_lmb_class_init,
>  };
>  
> +static const TypeInfo spapr_drc_pmem_info = {
> +    .name          = TYPE_SPAPR_DRC_PMEM,
> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
> +    .class_init    = spapr_drc_pmem_class_init,
> +};
> +
>  /* helper functions for external users */
>  
>  sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>      type_register_static(&spapr_drc_cpu_info);
>      type_register_static(&spapr_drc_pci_info);
>      type_register_static(&spapr_drc_lmb_info);
> +    type_register_static(&spapr_drc_pmem_info);
>  
>      spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>                          rtas_set_indicator);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 32719a1b72..a4fed84346 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>  #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>  #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>  #define RTAS_LOG_V6_HP_TYPE_PCI                          5
> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>      uint8_t hotplug_action;
>  #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>  #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>      case SPAPR_DR_CONNECTOR_TYPE_CPU:
>          hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>          break;
> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
> +        break;
>      default:
>          /* we shouldn't be signaling hotplug events for resources
>           * that don't support them
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index a947a0a0dc..21a9709afe 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>  
>      bool cmd_line_caps[SPAPR_CAP_NUM];
>      sPAPRCapabilities def, eff, mig;
> +    bool nvdimm_enabled;
>  };
>  
>  #define H_SUCCESS         0
> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>  #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>  #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>  
> +/*
> + * The nvdimm size should be aligned to SCM block size.
> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
> + * inorder to have SCM regions not to overlap with dimm memory regions.
> + * The SCM devices can have variable block sizes. For now, fixing the
> + * block size to the minimum value.
> + */
> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
> +
>  void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>  
>  #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> index f6ff32e7e2..65925d00b1 100644
> --- a/include/hw/ppc/spapr_drc.h
> +++ b/include/hw/ppc/spapr_drc.h
> @@ -70,6 +70,13 @@
>  #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>                                          TYPE_SPAPR_DRC_LMB)
>  
> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
> +#define SPAPR_DRC_PMEM_CLASS(klass) \
> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> +                                        TYPE_SPAPR_DRC_PMEM)
>  /*
>   * Various hotplug types managed by sPAPRDRConnector
>   *
> @@ -87,6 +94,7 @@ typedef enum {
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>  } sPAPRDRConnectorTypeShift;
>  
>  typedef enum {
> @@ -96,6 +104,7 @@ typedef enum {
>      SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>      SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>      SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>  } sPAPRDRConnectorType;
>  
>  /*
>
Shivaprasad G Bhat Feb. 15, 2019, 11:11 a.m. UTC | #2
Thanks for the comments, David. Please find my replies inline.


On 02/12/2019 07:19 AM, David Gibson wrote:
> On Tue, Feb 05, 2019 at 11:26:27PM -0600, Shivaprasad G Bhat wrote:
>> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
>> device interface in QEMU to support virtual NVDIMM devices for Power (May have
>> to re-look at this later).  Create the required DT entries for the
>> device (some entries have dummy values right now).
>>
>> The patch creates the required DT node and sends a hotplug
>> interrupt to the guest. Guest is expected to undertake the normal
>> DR resource add path in response and start issuing PAPR SCM hcalls.
>>
>> This is how it can be used ..
>> Add nvdimm=on to the qemu machine argument.
>> Ex : -machine pseries,nvdimm=on
>> For coldplug, the device to be added in qemu command line as shown below
>> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>
>> For hotplug, the device to be added from monitor as below
>> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>
>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>>                 [Early implementation]
>> ---
>>   default-configs/ppc64-softmmu.mak |    1
>>   hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>>   hw/ppc/spapr_drc.c                |   17 +++
>>   hw/ppc/spapr_events.c             |    4 +
>>   include/hw/ppc/spapr.h            |   10 ++
>>   include/hw/ppc/spapr_drc.h        |    9 ++
>>   6 files changed, 241 insertions(+), 12 deletions(-)
>>
>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
>> index 7f34ad0528..b6e1aa5125 100644
>> --- a/default-configs/ppc64-softmmu.mak
>> +++ b/default-configs/ppc64-softmmu.mak
>> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>>   CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>>   CONFIG_MEM_DEVICE=y
>>   CONFIG_DIMM=y
>> +CONFIG_NVDIMM=y
>>   CONFIG_SPAPR_RNG=y
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 0fcdd35cbe..7e7a1a8041 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -73,6 +73,7 @@
>>   #include "qemu/cutils.h"
>>   #include "hw/ppc/spapr_cpu_core.h"
>>   #include "hw/mem/memory-device.h"
>> +#include "hw/mem/nvdimm.h"
>>   
>>   #include <libfdt.h>
>>   
>> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>       uint8_t *int_buf, *cur_index, buf_len;
>>       int ret;
>>       uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
>> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>>       uint64_t addr, cur_addr, size;
>>       uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>>       uint64_t mem_end = machine->device_memory->base +
>> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>               nr_entries++;
>>           }
>>   
>> -        /* Entry for DIMM */
>> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>> -        g_assert(drc);
>> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
>> -                                     spapr_drc_index(drc), node,
>> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
>> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
>> +            /* Entry for NVDIMM */
>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
>> +            g_assert(drc);
>> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
>> +                                         spapr_drc_index(drc), -1, 0);
>> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
>> +        } else {
>> +            /* Entry for DIMM */
>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>> +            g_assert(drc);
>> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
>> +                                         spapr_drc_index(drc), node,
>> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
>> +            cur_addr = addr + size;
>> +        }
>>           QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>>           nr_entries++;
>> -        cur_addr = addr + size;
>>       }
>>   
>>       /* Entry for remaining hotpluggable area */
>> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>>       }
>>   }
>>   
>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
>> +                                      uint32_t node, uint64_t addr,
>> +                                      uint64_t size, uint64_t label_size);
> Re-ordering the code is generally preferred to static forward declarations.
Ok
>> +static void spapr_create_nvdimm(void *fdt)
> I'm trying to standardize on spapr_dt_*() for functions which generate
> bits of the device tree.
Ok. Will rename to spapr_dt_create_nvdimm
>> +{
>> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
>> +    GSList *dimms = NULL;
>> +
>> +    if (offset < 0) {
>> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
>> +        _FDT(offset);
>> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
>> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
>> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
> No need to explicitly set the "name" property, that's implicit in the
> node name.
Ok
>> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
>> +                                 "ibm,persistent-memory")));
>> +    }
>> +
>> +    /*NB : Add drc-info array here */
>> +
>> +    /* Create DT entries for cold plugged NVDIMM devices */
>> +    dimms = nvdimm_get_device_list();
>> +    for (; dimms; dimms = dimms->next) {
>> +        NVDIMMDevice *nvdimm = dimms->data;
>> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
>> +        uint64_t lsize = nvdimm->label_size;
>> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
>> +                                           NULL);
>> +
>> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
>> +                                   size, lsize);
> It might be cleaner to just pass the NVDIMMDevice * rather than
> umpteen parameters.
Ok.
>> +    }
>> +    g_slist_free(dimms);
>> +    return;
>> +}
>> +
>>   static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>   {
>>       MachineState *machine = MACHINE(spapr);
>> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>           exit(1);
>>       }
>>   
>> +    /* NVDIMM devices */
>> +    if (spapr->nvdimm_enabled) {
>> +        spapr_create_nvdimm(fdt);
>> +    }
>> +
>>       return fdt;
>>   }
>>   
>> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>>       }
>>   }
>>   
>> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    return spapr->nvdimm_enabled;
>> +}
>> +
>> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    spapr->nvdimm_enabled = value;
>> +}
>> +
>>   static void spapr_instance_init(Object *obj)
>>   {
>>       sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>>       object_property_set_description(obj, "ic-mode",
>>                    "Specifies the interrupt controller mode (xics, xive, dual)",
>>                    NULL);
>> +    object_property_add_bool(obj, "nvdimm",
>> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
>> +    object_property_set_description(obj, "nvdimm",
>> +                                    "Enable support for nvdimm devices",
>> +                                    NULL);
> I'm not seeing a lot of point to this machine parameter.
I just followed what x86 does here.

>>   }
>>   
>>   static void spapr_machine_finalizefn(Object *obj)
>> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>       }
>>   }
>>   
>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
>> +                                      uint64_t addr, uint64_t size,
>> +                                      uint64_t label_size)
>> +{
>> +    int offset;
>> +    char buf[40];
>> +    GString *lcode = g_string_sized_new(10);
>> +    sPAPRDRConnector *drc;
>> +    QemuUUID uuid;
>> +    uint32_t drc_idx;
>> +    uint32_t associativity[] = {
>> +        cpu_to_be32(0x4), /* length */
>> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
>> +        cpu_to_be32(0x0), cpu_to_be32(node)
>> +    };
>> +
>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    g_assert(drc);
>> +
>> +    drc_idx = spapr_drc_index(drc);
>> +
>> +    sprintf(buf, "pmem@%x", drc_idx);
>> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
> "fdt_offset" vs. "offset" isn't very obvious.  Maybe parent_offset /
> child_offset or something?
Ok
>> +    _FDT(offset);
>> +
>> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
>> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
>> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
> Again, no need to set "name".
Ok
>> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
>> +
>> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
>> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
>> +    g_string_free(lcode, TRUE);
> I think leaving this property out would be preferable to including it
> but putting nothing useful there.
Ok.
>> +
>> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
>> +                      sizeof(associativity))));
>> +    g_random_set_seed(drc_idx);
>> +    qemu_uuid_generate(&uuid);
> This looks bogus.  I'm guessing the set seed is so that you generate
> consistent UUIDs for the same NVDIMM in a guest.  First, that's making
> a lot of assumptions about how qemu_uuid_generate() works that aren't
> really warranted.  Second, it poisons the RNG for anything running
> after this which actually wants (pseudo) random numbers.
>
> I think you need to make the UUID a property of the device instead.
Ok.
>> +
>> +    qemu_uuid_unparse(&uuid, buf);
>> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
>> +
>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
>> +
>> +    /*NB : What it should be? */
>> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
>> +
>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
>> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
>> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
>> +
>> +    return offset;
>> +}
>> +
>> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
>> +                             uint64_t size, uint32_t node,
>> +                             Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
>> +    sPAPRDRConnector *drc;
>> +    bool hotplugged = spapr_drc_hotplugged(dev);
>> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>> +    void *fdt;
>> +    int fdt_offset, fdt_size;
>> +    Error *local_err = NULL;
>> +
>> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
>> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    g_assert(drc);
> Creating the DRC in the hotplug path looks bogus.  Generally the DRC
> has to exist before you can even attempt to plug the device.

We don't really know how many DRCs to create. Unlike memory hotplug,
where we know how many LMBs are required to fit up to maxmem, in this
case we don't know how many NVDIMM devices a guest can have. That is the
reason I am creating the DRC on demand. I'll see if it is possible to
address this by putting a cap on the maximum number of NVDIMM devices a
guest can have.


>> +    fdt = create_device_tree(&fdt_size);
>> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
>> +                                            size, nvdimm->label_size);
>> +
>> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    if (hotplugged) {
>> +        spapr_hotplug_req_add_by_index(drc);
>> +    }
>> +}
>> +
>>   static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>                                 Error **errp)
>>   {
>>       Error *local_err = NULL;
>>       sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>>       PCDIMMDevice *dimm = PC_DIMM(dev);
>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>       uint64_t size, addr;
>>       uint32_t node;
>>   
>> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>   
>>       node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>>                                       &error_abort);
>> -    spapr_add_lmbs(dev, addr, size, node,
>> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>> -                   &local_err);
>> +    if (!is_nvdimm) {
>> +        spapr_add_lmbs(dev, addr, size, node,
>> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>> +                       &local_err);
>> +    } else {
>> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
>> +    }
>> +
>>       if (local_err) {
>>           goto out_unplug;
>>       }
>> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>   {
>>       const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>>       sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>       PCDIMMDevice *dimm = PC_DIMM(dev);
>>       Error *local_err = NULL;
>>       uint64_t size;
>> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>           return;
>>       }
>>   
>> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
>> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>>           error_setg(errp, "Hotplugged memory size must be a multiple of "
>> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>           return;
>> +    } else if (is_nvdimm) {
>> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
>> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
>> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>> +            return;
>> +        }
>> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
>> +            error_setg(errp, "NVDIMM size must be atleast "
>> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>> +            return;
>> +        }
>> +
>> +        /* Align to scm block size, exclude the label */
>> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
>> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>>       }
>>   
>>       memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
>> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
>> index 2edb7d1e9c..94ddd102cc 100644
>> --- a/hw/ppc/spapr_drc.c
>> +++ b/hw/ppc/spapr_drc.c
>> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>>       drck->release = spapr_lmb_release;
>>   }
>>   
>> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
>> +{
>> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
>> +
>> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
>> +    drck->typename = "MEM";
>> +    drck->drc_name_prefix = "PMEM ";
>> +    drck->release = NULL;
>> +}
>> +
>>   static const TypeInfo spapr_dr_connector_info = {
>>       .name          = TYPE_SPAPR_DR_CONNECTOR,
>>       .parent        = TYPE_DEVICE,
>> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>>       .class_init    = spapr_drc_lmb_class_init,
>>   };
>>   
>> +static const TypeInfo spapr_drc_pmem_info = {
>> +    .name          = TYPE_SPAPR_DRC_PMEM,
>> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
>> +    .class_init    = spapr_drc_pmem_class_init,
>> +};
>> +
>>   /* helper functions for external users */
>>   
>>   sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
>> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>>       type_register_static(&spapr_drc_cpu_info);
>>       type_register_static(&spapr_drc_pci_info);
>>       type_register_static(&spapr_drc_lmb_info);
>> +    type_register_static(&spapr_drc_pmem_info);
>>   
>>       spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>>                           rtas_set_indicator);
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 32719a1b72..a4fed84346 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>>   #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>>   #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>>   #define RTAS_LOG_V6_HP_TYPE_PCI                          5
>> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>>       uint8_t hotplug_action;
>>   #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>>   #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
>> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>>       case SPAPR_DR_CONNECTOR_TYPE_CPU:
>>           hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>>           break;
>> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
>> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
>> +        break;
>>       default:
>>           /* we shouldn't be signaling hotplug events for resources
>>            * that don't support them
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index a947a0a0dc..21a9709afe 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>>   
>>       bool cmd_line_caps[SPAPR_CAP_NUM];
>>       sPAPRCapabilities def, eff, mig;
>> +    bool nvdimm_enabled;
>>   };
>>   
>>   #define H_SUCCESS         0
>> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>>   #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>>   #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>>   
>> +/*
>> + * The nvdimm size should be aligned to SCM block size.
>> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
>> + * inorder to have SCM regions not to overlap with dimm memory regions.
>> + * The SCM devices can have variable block sizes. For now, fixing the
>> + * block size to the minimum value.
>> + */
>> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
>> +
>>   void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>>   
>>   #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
>> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
>> index f6ff32e7e2..65925d00b1 100644
>> --- a/include/hw/ppc/spapr_drc.h
>> +++ b/include/hw/ppc/spapr_drc.h
>> @@ -70,6 +70,13 @@
>>   #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>                                           TYPE_SPAPR_DRC_LMB)
>>   
>> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
>> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
>> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
>> +#define SPAPR_DRC_PMEM_CLASS(klass) \
>> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
>> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>> +                                        TYPE_SPAPR_DRC_PMEM)
>>   /*
>>    * Various hotplug types managed by sPAPRDRConnector
>>    *
>> @@ -87,6 +94,7 @@ typedef enum {
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
>> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>>   } sPAPRDRConnectorTypeShift;
>>   
>>   typedef enum {
>> @@ -96,6 +104,7 @@ typedef enum {
>>       SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>>       SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>>       SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
>> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>>   } sPAPRDRConnectorType;
>>   
>>   /*
>>
David Gibson Feb. 17, 2019, 11:02 p.m. UTC | #3
On Fri, Feb 15, 2019 at 04:41:09PM +0530, Shivaprasad G Bhat wrote:
> Thanks for the comments David. Please find my replies inline..
> 
> 
> On 02/12/2019 07:19 AM, David Gibson wrote:
> > On Tue, Feb 05, 2019 at 11:26:27PM -0600, Shivaprasad G Bhat wrote:
> > > Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
> > > device interface in QEMU to support virtual NVDIMM devices for Power (May have
> > > to re-look at this later).  Create the required DT entries for the
> > > device (some entries have dummy values right now).
> > > 
> > > The patch creates the required DT node and sends a hotplug
> > > interrupt to the guest. Guest is expected to undertake the normal
> > > DR resource add path in response and start issuing PAPR SCM hcalls.
> > > 
> > > This is how it can be used ..
> > > Add nvdimm=on to the qemu machine argument.
> > > Ex : -machine pseries,nvdimm=on
> > > For coldplug, the device to be added in qemu command line as shown below
> > > -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> > > -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> > > 
> > > For hotplug, the device to be added from monitor as below
> > > object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> > > device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> > > 
> > > Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> > > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > >                 [Early implementation]
> > > ---
> > >   default-configs/ppc64-softmmu.mak |    1
> > >   hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
> > >   hw/ppc/spapr_drc.c                |   17 +++
> > >   hw/ppc/spapr_events.c             |    4 +
> > >   include/hw/ppc/spapr.h            |   10 ++
> > >   include/hw/ppc/spapr_drc.h        |    9 ++
> > >   6 files changed, 241 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> > > index 7f34ad0528..b6e1aa5125 100644
> > > --- a/default-configs/ppc64-softmmu.mak
> > > +++ b/default-configs/ppc64-softmmu.mak
> > > @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
> > >   CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
> > >   CONFIG_MEM_DEVICE=y
> > >   CONFIG_DIMM=y
> > > +CONFIG_NVDIMM=y
> > >   CONFIG_SPAPR_RNG=y
> > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > > index 0fcdd35cbe..7e7a1a8041 100644
> > > --- a/hw/ppc/spapr.c
> > > +++ b/hw/ppc/spapr.c
> > > @@ -73,6 +73,7 @@
> > >   #include "qemu/cutils.h"
> > >   #include "hw/ppc/spapr_cpu_core.h"
> > >   #include "hw/mem/memory-device.h"
> > > +#include "hw/mem/nvdimm.h"
> > >   #include <libfdt.h>
> > > @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> > >       uint8_t *int_buf, *cur_index, buf_len;
> > >       int ret;
> > >       uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > > +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> > >       uint64_t addr, cur_addr, size;
> > >       uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
> > >       uint64_t mem_end = machine->device_memory->base +
> > > @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> > >               nr_entries++;
> > >           }
> > > -        /* Entry for DIMM */
> > > -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> > > -        g_assert(drc);
> > > -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
> > > -                                     spapr_drc_index(drc), node,
> > > -                                     SPAPR_LMB_FLAGS_ASSIGNED);
> > > +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
> > > +            /* Entry for NVDIMM */
> > > +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
> > > +            g_assert(drc);
> > > +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
> > > +                                         spapr_drc_index(drc), -1, 0);
> > > +            cur_addr = ROUND_UP(addr + size, scm_block_size);
> > > +        } else {
> > > +            /* Entry for DIMM */
> > > +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> > > +            g_assert(drc);
> > > +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
> > > +                                         spapr_drc_index(drc), node,
> > > +                                         SPAPR_LMB_FLAGS_ASSIGNED);
> > > +            cur_addr = addr + size;
> > > +        }
> > >           QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> > >           nr_entries++;
> > > -        cur_addr = addr + size;
> > >       }
> > >       /* Entry for remaining hotpluggable area */
> > > @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
> > >       }
> > >   }
> > > +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
> > > +                                      uint32_t node, uint64_t addr,
> > > +                                      uint64_t size, uint64_t label_size);
> > Re-ordering the code is generally preferred to static forward declarations.
> Ok
> > > +static void spapr_create_nvdimm(void *fdt)
> > I'm trying to standardize on spapr_dt_*() for functions which generate
> > bits of the device tree.
> Ok. Will rename to spapr_dt_create_nvdimm

Just spapr_dt_nvdimm() would be preferred.

> > > +{
> > > +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
> > > +    GSList *dimms = NULL;
> > > +
> > > +    if (offset < 0) {
> > > +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
> > > +        _FDT(offset);
> > > +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
> > > +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
> > > +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
> > No need to explicitly set the "name" property, that's implicit in the
> > node name.
> Ok
> > > +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
> > > +                                 "ibm,persistent-memory")));
> > > +    }
> > > +
> > > +    /*NB : Add drc-info array here */
> > > +
> > > +    /* Create DT entries for cold plugged NVDIMM devices */
> > > +    dimms = nvdimm_get_device_list();
> > > +    for (; dimms; dimms = dimms->next) {
> > > +        NVDIMMDevice *nvdimm = dimms->data;
> > > +        PCDIMMDevice *di = PC_DIMM(nvdimm);
> > > +        uint64_t lsize = nvdimm->label_size;
> > > +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> > > +                                           NULL);
> > > +
> > > +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
> > > +                                   size, lsize);
> > It might be cleaner to just pass the NVDIMMDevice * rather than
> > umpteen parameters.
> Ok.
> > > +    }
> > > +    g_slist_free(dimms);
> > > +    return;
> > > +}
> > > +
> > >   static void *spapr_build_fdt(sPAPRMachineState *spapr)
> > >   {
> > >       MachineState *machine = MACHINE(spapr);
> > > @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
> > >           exit(1);
> > >       }
> > > +    /* NVDIMM devices */
> > > +    if (spapr->nvdimm_enabled) {
> > > +        spapr_create_nvdimm(fdt);
> > > +    }
> > > +
> > >       return fdt;
> > >   }
> > > @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
> > >       }
> > >   }
> > > +static bool spapr_get_nvdimm(Object *obj, Error **errp)
> > > +{
> > > +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> > > +
> > > +    return spapr->nvdimm_enabled;
> > > +}
> > > +
> > > +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
> > > +{
> > > +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> > > +
> > > +    spapr->nvdimm_enabled = value;
> > > +}
> > > +
> > >   static void spapr_instance_init(Object *obj)
> > >   {
> > >       sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> > > @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
> > >       object_property_set_description(obj, "ic-mode",
> > >                    "Specifies the interrupt controller mode (xics, xive, dual)",
> > >                    NULL);
> > > +    object_property_add_bool(obj, "nvdimm",
> > > +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
> > > +    object_property_set_description(obj, "nvdimm",
> > > +                                    "Enable support for nvdimm devices",
> > > +                                    NULL);
> > I'm not seeing a lot of point to this machine parameter.
> Just followed what the x86 is doing here.

Hmm.  I wonder what the rationale for the property is there.

> > >   }
> > >   static void spapr_machine_finalizefn(Object *obj)
> > > @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> > >       }
> > >   }
> > > +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
> > > +                                      uint64_t addr, uint64_t size,
> > > +                                      uint64_t label_size)
> > > +{
> > > +    int offset;
> > > +    char buf[40];
> > > +    GString *lcode = g_string_sized_new(10);
> > > +    sPAPRDRConnector *drc;
> > > +    QemuUUID uuid;
> > > +    uint32_t drc_idx;
> > > +    uint32_t associativity[] = {
> > > +        cpu_to_be32(0x4), /* length */
> > > +        cpu_to_be32(0x0), cpu_to_be32(0x0),
> > > +        cpu_to_be32(0x0), cpu_to_be32(node)
> > > +    };
> > > +
> > > +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> > > +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> > > +    g_assert(drc);
> > > +
> > > +    drc_idx = spapr_drc_index(drc);
> > > +
> > > +    sprintf(buf, "pmem@%x", drc_idx);
> > > +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
> > "fdt_offset" vs. "offset" isn't very obvious.  Maybe parent_offset /
> > child_offset or something?
> Ok
> > > +    _FDT(offset);
> > > +
> > > +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
> > > +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
> > > +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
> > Again, no need to set "name".
> Ok
> > > +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
> > > +
> > > +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
> > > +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
> > > +    g_string_free(lcode, TRUE);
> > I think leaving this property out would be preferable to including it
> > but putting nothing useful there.
> Ok.
> > > +
> > > +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> > > +                      sizeof(associativity))));
> > > +    g_random_set_seed(drc_idx);
> > > +    qemu_uuid_generate(&uuid);
> > This looks bogus.  I'm guessing the set seed is so that you generate
> > consistent UUIDs for the same NVDIMM in a guest.  First, that's making
> > a lot of assumptions about how qemu_uuid_generate() works that aren't
> > really warranted.  Second, it poisons the RNG for anything running
> > after this which actually wants (pseudo) random numbers.
> > 
> > I think you need to make the UUID a property of the device instead.
> Ok.
> > > +
> > > +    qemu_uuid_unparse(&uuid, buf);
> > > +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> > > +
> > > +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> > > +
> > > +    /*NB : What it should be? */
> > > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> > > +
> > > +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> > > +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> > > +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> > > +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> > > +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> > > +
> > > +    return offset;
> > > +}
> > > +
> > > +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> > > +                             uint64_t size, uint32_t node,
> > > +                             Error **errp)
> > > +{
> > > +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> > > +    sPAPRDRConnector *drc;
> > > +    bool hotplugged = spapr_drc_hotplugged(dev);
> > > +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> > > +    void *fdt;
> > > +    int fdt_offset, fdt_size;
> > > +    Error *local_err = NULL;
> > > +
> > > +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> > > +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> > > +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> > > +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> > > +    g_assert(drc);
> > Creating the DRC in the hotplug path looks bogus.  Generally the DRC
> > has to exist before you can even attempt to plug the device.
> 
> We dont really know how many DRC to create. Unlike memory hotplug
> where we know how many LMBs are required to fit till the maxmem, in this
> case we dont know how many NVDIMM devices  guest can have. That is the
> reason I am creating the DRC on demand. I'll see if it is possible to
> address this
> by putting a cap on maximum number of NVDIMM devices a guest can have.

Urgh, PAPR.  First it specifies a crappy hotplug model that requires
zillions of fixed attachment points to be instantiated, then it breaks
its own model.

But.. I still don't really understand how this works.

a) How does the guest know the DRC index to use for the new NVDIMM?
   Generally that comes from the device tree, but the guest doesn't
   get new device tree information until it calls configure-connector
   for which it needs the DRC index.

b) AFAICT, NVDIMMs would also require HPT space, much like regular
   memory would.  PowerVM doesn't have HPT resizing, so surely it must
   already have some sort of cap on the amount of NVDIMM space in
   order to size the HPT correctly.


> > > +    fdt = create_device_tree(&fdt_size);
> > > +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
> > > +                                            size, nvdimm->label_size);
> > > +
> > > +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> > > +    if (local_err) {
> > > +        error_propagate(errp, local_err);
> > > +        return;
> > > +    }
> > > +
> > > +    if (hotplugged) {
> > > +        spapr_hotplug_req_add_by_index(drc);
> > > +    }
> > > +}
> > > +
> > >   static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> > >                                 Error **errp)
> > >   {
> > >       Error *local_err = NULL;
> > >       sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
> > >       PCDIMMDevice *dimm = PC_DIMM(dev);
> > > +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> > >       uint64_t size, addr;
> > >       uint32_t node;
> > > @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> > >       node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
> > >                                       &error_abort);
> > > -    spapr_add_lmbs(dev, addr, size, node,
> > > -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> > > -                   &local_err);
> > > +    if (!is_nvdimm) {
> > > +        spapr_add_lmbs(dev, addr, size, node,
> > > +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> > > +                       &local_err);
> > > +    } else {
> > > +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
> > > +    }
> > > +
> > >       if (local_err) {
> > >           goto out_unplug;
> > >       }
> > > @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> > >   {
> > >       const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
> > >       sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
> > > +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> > >       PCDIMMDevice *dimm = PC_DIMM(dev);
> > >       Error *local_err = NULL;
> > >       uint64_t size;
> > > @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> > >           return;
> > >       }
> > > -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
> > > +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
> > >           error_setg(errp, "Hotplugged memory size must be a multiple of "
> > > -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> > > +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> > >           return;
> > > +    } else if (is_nvdimm) {
> > > +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> > > +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> > > +            error_setg(errp, "NVDIMM memory size must be a multiple of "
> > > +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> > > +            return;
> > > +        }
> > > +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
> > > +            error_setg(errp, "NVDIMM size must be atleast "
> > > +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> > > +            return;
> > > +        }
> > > +
> > > +        /* Align to scm block size, exclude the label */
> > > +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
> > > +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
> > > +        if (local_err) {
> > > +            error_propagate(errp, local_err);
> > > +            return;
> > > +        }
> > >       }
> > >       memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
> > > diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> > > index 2edb7d1e9c..94ddd102cc 100644
> > > --- a/hw/ppc/spapr_drc.c
> > > +++ b/hw/ppc/spapr_drc.c
> > > @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
> > >       drck->release = spapr_lmb_release;
> > >   }
> > > +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
> > > +{
> > > +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
> > > +
> > > +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
> > > +    drck->typename = "MEM";
> > > +    drck->drc_name_prefix = "PMEM ";
> > > +    drck->release = NULL;
> > > +}
> > > +
> > >   static const TypeInfo spapr_dr_connector_info = {
> > >       .name          = TYPE_SPAPR_DR_CONNECTOR,
> > >       .parent        = TYPE_DEVICE,
> > > @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
> > >       .class_init    = spapr_drc_lmb_class_init,
> > >   };
> > > +static const TypeInfo spapr_drc_pmem_info = {
> > > +    .name          = TYPE_SPAPR_DRC_PMEM,
> > > +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
> > > +    .class_init    = spapr_drc_pmem_class_init,
> > > +};
> > > +
> > >   /* helper functions for external users */
> > >   sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
> > > @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
> > >       type_register_static(&spapr_drc_cpu_info);
> > >       type_register_static(&spapr_drc_pci_info);
> > >       type_register_static(&spapr_drc_lmb_info);
> > > +    type_register_static(&spapr_drc_pmem_info);
> > >       spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
> > >                           rtas_set_indicator);
> > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > > index 32719a1b72..a4fed84346 100644
> > > --- a/hw/ppc/spapr_events.c
> > > +++ b/hw/ppc/spapr_events.c
> > > @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
> > >   #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
> > >   #define RTAS_LOG_V6_HP_TYPE_PHB                          4
> > >   #define RTAS_LOG_V6_HP_TYPE_PCI                          5
> > > +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
> > >       uint8_t hotplug_action;
> > >   #define RTAS_LOG_V6_HP_ACTION_ADD                        1
> > >   #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
> > > @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > >       case SPAPR_DR_CONNECTOR_TYPE_CPU:
> > >           hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
> > >           break;
> > > +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
> > > +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
> > > +        break;
> > >       default:
> > >           /* we shouldn't be signaling hotplug events for resources
> > >            * that don't support them
> > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > > index a947a0a0dc..21a9709afe 100644
> > > --- a/include/hw/ppc/spapr.h
> > > +++ b/include/hw/ppc/spapr.h
> > > @@ -187,6 +187,7 @@ struct sPAPRMachineState {
> > >       bool cmd_line_caps[SPAPR_CAP_NUM];
> > >       sPAPRCapabilities def, eff, mig;
> > > +    bool nvdimm_enabled;
> > >   };
> > >   #define H_SUCCESS         0
> > > @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
> > >   #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
> > >   #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
> > > +/*
> > > + * The nvdimm size should be aligned to SCM block size.
> > > + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
> > > + * inorder to have SCM regions not to overlap with dimm memory regions.
> > > + * The SCM devices can have variable block sizes. For now, fixing the
> > > + * block size to the minimum value.
> > > + */
> > > +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
> > > +
> > >   void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
> > >   #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> > > diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> > > index f6ff32e7e2..65925d00b1 100644
> > > --- a/include/hw/ppc/spapr_drc.h
> > > +++ b/include/hw/ppc/spapr_drc.h
> > > @@ -70,6 +70,13 @@
> > >   #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> > >                                           TYPE_SPAPR_DRC_LMB)
> > > +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
> > > +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
> > > +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
> > > +#define SPAPR_DRC_PMEM_CLASS(klass) \
> > > +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
> > > +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> > > +                                        TYPE_SPAPR_DRC_PMEM)
> > >   /*
> > >    * Various hotplug types managed by sPAPRDRConnector
> > >    *
> > > @@ -87,6 +94,7 @@ typedef enum {
> > >       SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
> > >       SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
> > >       SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
> > > +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
> > >   } sPAPRDRConnectorTypeShift;
> > >   typedef enum {
> > > @@ -96,6 +104,7 @@ typedef enum {
> > >       SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
> > >       SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
> > >       SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
> > > +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
> > >   } sPAPRDRConnectorType;
> > >   /*
> > > 
>
Shivaprasad G Bhat Feb. 18, 2019, 4:15 p.m. UTC | #4
On 02/18/2019 04:32 AM, David Gibson wrote:
> On Fri, Feb 15, 2019 at 04:41:09PM +0530, Shivaprasad G Bhat wrote:
>> Thanks for the comments David. Please find my replies inline..
>>
>>
>> On 02/12/2019 07:19 AM, David Gibson wrote:
>>> On Tue, Feb 05, 2019 at 11:26:27PM -0600, Shivaprasad G Bhat wrote:
>>>> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
>>>> device interface in QEMU to support virtual NVDIMM devices for Power (May have
>>>> to re-look at this later).  Create the required DT entries for the
>>>> device (some entries have dummy values right now).
>>>>
>>>> The patch creates the required DT node and sends a hotplug
>>>> interrupt to the guest. Guest is expected to undertake the normal
>>>> DR resource add path in response and start issuing PAPR SCM hcalls.
>>>>
>>>> This is how it can be used ..
>>>> Add nvdimm=on to the qemu machine argument.
>>>> Ex : -machine pseries,nvdimm=on
>>>> For coldplug, the device to be added in qemu command line as shown below
>>>> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>>>> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>>>
>>>> For hotplug, the device to be added from monitor as below
>>>> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>>>> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>>>
>>>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>>>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>>>>                  [Early implementation]
>>>> ---
>>>>    default-configs/ppc64-softmmu.mak |    1
>>>>    hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>>>>    hw/ppc/spapr_drc.c                |   17 +++
>>>>    hw/ppc/spapr_events.c             |    4 +
>>>>    include/hw/ppc/spapr.h            |   10 ++
>>>>    include/hw/ppc/spapr_drc.h        |    9 ++
>>>>    6 files changed, 241 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
>>>> index 7f34ad0528..b6e1aa5125 100644
>>>> --- a/default-configs/ppc64-softmmu.mak
>>>> +++ b/default-configs/ppc64-softmmu.mak
>>>> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>>>>    CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>>>>    CONFIG_MEM_DEVICE=y
>>>>    CONFIG_DIMM=y
>>>> +CONFIG_NVDIMM=y
>>>>    CONFIG_SPAPR_RNG=y
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index 0fcdd35cbe..7e7a1a8041 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -73,6 +73,7 @@
>>>>    #include "qemu/cutils.h"
>>>>    #include "hw/ppc/spapr_cpu_core.h"
>>>>    #include "hw/mem/memory-device.h"
>>>> +#include "hw/mem/nvdimm.h"
>>>>    #include <libfdt.h>
>>>> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>>>        uint8_t *int_buf, *cur_index, buf_len;
>>>>        int ret;
>>>>        uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
>>>> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>>>>        uint64_t addr, cur_addr, size;
>>>>        uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>>>>        uint64_t mem_end = machine->device_memory->base +
>>>> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>>>                nr_entries++;
>>>>            }
>>>> -        /* Entry for DIMM */
>>>> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>>>> -        g_assert(drc);
>>>> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
>>>> -                                     spapr_drc_index(drc), node,
>>>> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
>>>> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
>>>> +            /* Entry for NVDIMM */
>>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
>>>> +            g_assert(drc);
>>>> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
>>>> +                                         spapr_drc_index(drc), -1, 0);
>>>> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
>>>> +        } else {
>>>> +            /* Entry for DIMM */
>>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>>>> +            g_assert(drc);
>>>> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
>>>> +                                         spapr_drc_index(drc), node,
>>>> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
>>>> +            cur_addr = addr + size;
>>>> +        }
>>>>            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>>>>            nr_entries++;
>>>> -        cur_addr = addr + size;
>>>>        }
>>>>        /* Entry for remaining hotpluggable area */
>>>> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>>>>        }
>>>>    }
>>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
>>>> +                                      uint32_t node, uint64_t addr,
>>>> +                                      uint64_t size, uint64_t label_size);
>>> Re-ordering the code is generally preferred to static forward declarations.
>> Ok
>>>> +static void spapr_create_nvdimm(void *fdt)
>>> I'm trying to standardize on spapr_dt_*() for functions which generate
>>> bits of the device tree.
>> Ok. Will rename to spapr_dt_create_nvdimm
> Just spapr_dt_nvdimm() would be preferred.
Ok.
>
>>>> +{
>>>> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
>>>> +    GSList *dimms = NULL;
>>>> +
>>>> +    if (offset < 0) {
>>>> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
>>>> +        _FDT(offset);
>>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
>>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
>>>> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
>>> No need to explicitly set the "name" property, that's implicit in the
>>> node name.
>> Ok
>>>> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
>>>> +                                 "ibm,persistent-memory")));
>>>> +    }
>>>> +
>>>> +    /*NB : Add drc-info array here */
>>>> +
>>>> +    /* Create DT entries for cold plugged NVDIMM devices */
>>>> +    dimms = nvdimm_get_device_list();
>>>> +    for (; dimms; dimms = dimms->next) {
>>>> +        NVDIMMDevice *nvdimm = dimms->data;
>>>> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
>>>> +        uint64_t lsize = nvdimm->label_size;
>>>> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
>>>> +                                           NULL);
>>>> +
>>>> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
>>>> +                                   size, lsize);
>>> It might be cleaner to just pass the NVDIMMDevice * rather than
>>> umpteen parameters.
>> Ok.
>>>> +    }
>>>> +    g_slist_free(dimms);
>>>> +    return;
>>>> +}
>>>> +
>>>>    static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>>>    {
>>>>        MachineState *machine = MACHINE(spapr);
>>>> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>>>            exit(1);
>>>>        }
>>>> +    /* NVDIMM devices */
>>>> +    if (spapr->nvdimm_enabled) {
>>>> +        spapr_create_nvdimm(fdt);
>>>> +    }
>>>> +
>>>>        return fdt;
>>>>    }
>>>> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>>>>        }
>>>>    }
>>>> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    return spapr->nvdimm_enabled;
>>>> +}
>>>> +
>>>> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    spapr->nvdimm_enabled = value;
>>>> +}
>>>> +
>>>>    static void spapr_instance_init(Object *obj)
>>>>    {
>>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>>>>        object_property_set_description(obj, "ic-mode",
>>>>                     "Specifies the interrupt controller mode (xics, xive, dual)",
>>>>                     NULL);
>>>> +    object_property_add_bool(obj, "nvdimm",
>>>> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
>>>> +    object_property_set_description(obj, "nvdimm",
>>>> +                                    "Enable support for nvdimm devices",
>>>> +                                    NULL);
>>> I'm not seeing a lot of point to this machine parameter.
>> Just followed what the x86 is doing here.
> Hmm.  I wonder what the rationale for the property is there.
>
>>>>    }
>>>>    static void spapr_machine_finalizefn(Object *obj)
>>>> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>>>        }
>>>>    }
>>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
>>>> +                                      uint64_t addr, uint64_t size,
>>>> +                                      uint64_t label_size)
>>>> +{
>>>> +    int offset;
>>>> +    char buf[40];
>>>> +    GString *lcode = g_string_sized_new(10);
>>>> +    sPAPRDRConnector *drc;
>>>> +    QemuUUID uuid;
>>>> +    uint32_t drc_idx;
>>>> +    uint32_t associativity[] = {
>>>> +        cpu_to_be32(0x4), /* length */
>>>> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
>>>> +        cpu_to_be32(0x0), cpu_to_be32(node)
>>>> +    };
>>>> +
>>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    g_assert(drc);
>>>> +
>>>> +    drc_idx = spapr_drc_index(drc);
>>>> +
>>>> +    sprintf(buf, "pmem@%x", drc_idx);
>>>> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
>>> "fdt_offset" vs. "offset" isn't very obvious.  Maybe parent_offset /
>>> child_offset or something?
>> Ok
>>>> +    _FDT(offset);
>>>> +
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
>>> Again, no need to set "name".
>> Ok
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
>>>> +
>>>> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
>>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
>>>> +    g_string_free(lcode, TRUE);
>>> I think leaving this property out would be preferable to including it
>>> but putting nothing useful there.
>> Ok.
>>>> +
>>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
>>>> +                      sizeof(associativity))));
>>>> +    g_random_set_seed(drc_idx);
>>>> +    qemu_uuid_generate(&uuid);
>>> This looks bogus.  I'm guessing the set seed is so that you generate
>>> consistent UUIDs for the same NVDIMM in a guest.  First, that's making
>>> a lot of assumptions about how qemu_uuid_generate() works that aren't
>>> really warranted.  Second, it poisons the RNG for anything running
>>> after this which actually wants (pseudo) random numbers.
>>>
>>> I think you need to make the UUID a property of the device instead.
>> Ok.
>>>> +
>>>> +    qemu_uuid_unparse(&uuid, buf);
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
>>>> +
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
>>>> +
>>>> +    /*NB : What it should be? */
>>>> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
>>>> +
>>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
>>>> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
>>>> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
>>>> +
>>>> +    return offset;
>>>> +}
>>>> +
>>>> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
>>>> +                             uint64_t size, uint32_t node,
>>>> +                             Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
>>>> +    sPAPRDRConnector *drc;
>>>> +    bool hotplugged = spapr_drc_hotplugged(dev);
>>>> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>>>> +    void *fdt;
>>>> +    int fdt_offset, fdt_size;
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
>>>> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    g_assert(drc);
>>> Creating the DRC in the hotplug path looks bogus.  Generally the DRC
>>> has to exist before you can even attempt to plug the device.
>> We don't really know how many DRCs to create. Unlike memory hotplug,
>> where we know how many LMBs are required to fit up to maxmem, in this
>> case we don't know how many NVDIMM devices a guest can have. That is the
>> reason I am creating the DRC on demand. I'll see if it is possible to
>> address this by putting a cap on the maximum number of NVDIMM devices a
>> guest can have.
> Urgh, PAPR.  First it specifies a crappy hotplug model that requires
> zillions of fixed attachment points to be instantiated, then it breaks
> its own model.
>
> But.. I still don't really understand how this works.
>
> a) How does the guest know the DRC index to use for the new NVDIMM?
>     Generally that comes from the device tree, but the guest doesn't
>     get new device tree information until it calls configure-connector
>     for which it needs the DRC index.
The DRC is passed in the device tree blob that is sent as the payload of the
hotplug interrupt; the guest picks the DRC index from it and makes the
subsequent calls.
> b) AFAICT, NVDIMMs would also require HPT space, much like regular
>     memory would.  PowerVM doesn't have HPT resizing, so surely it must
>     already have some sort of cap on the amount of NVDIMM space in
>     order to size the HPT correctly.
On Power KVM we will enforce that the NVDIMM is mapped within maxmem,
although the spec allows it to be mapped outside of that range. Coming back
to the original point about creating the DRCs at hotplug time: we could
impose a limit on the number of NVDIMM devices that can be hotplugged, so
that we can create the DRCs at machine init time.
>>>> +    fdt = create_device_tree(&fdt_size);
>>>> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
>>>> +                                            size, nvdimm->label_size);
>>>> +
>>>> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
>>>> +    if (local_err) {
>>>> +        error_propagate(errp, local_err);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (hotplugged) {
>>>> +        spapr_hotplug_req_add_by_index(drc);
>>>> +    }
>>>> +}
>>>> +
>>>>    static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>                                  Error **errp)
>>>>    {
>>>>        Error *local_err = NULL;
>>>>        sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
>>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>>>        uint64_t size, addr;
>>>>        uint32_t node;
>>>> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>        node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>>>>                                        &error_abort);
>>>> -    spapr_add_lmbs(dev, addr, size, node,
>>>> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>>>> -                   &local_err);
>>>> +    if (!is_nvdimm) {
>>>> +        spapr_add_lmbs(dev, addr, size, node,
>>>> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>>>> +                       &local_err);
>>>> +    } else {
>>>> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
>>>> +    }
>>>> +
>>>>        if (local_err) {
>>>>            goto out_unplug;
>>>>        }
>>>> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>    {
>>>>        const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
>>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
>>>>        Error *local_err = NULL;
>>>>        uint64_t size;
>>>> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>            return;
>>>>        }
>>>> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
>>>> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>>>>            error_setg(errp, "Hotplugged memory size must be a multiple of "
>>>> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>>> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>>>            return;
>>>> +    } else if (is_nvdimm) {
>>>> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>>>> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
>>>> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
>>>> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>>>> +            return;
>>>> +        }
>>>> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
>>>> +            error_setg(errp, "NVDIMM size must be atleast "
>>>> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>>>> +            return;
>>>> +        }
>>>> +
>>>> +        /* Align to scm block size, exclude the label */
>>>> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
>>>> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>>        }
>>>>        memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
>>>> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
>>>> index 2edb7d1e9c..94ddd102cc 100644
>>>> --- a/hw/ppc/spapr_drc.c
>>>> +++ b/hw/ppc/spapr_drc.c
>>>> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>>>>        drck->release = spapr_lmb_release;
>>>>    }
>>>> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
>>>> +{
>>>> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
>>>> +
>>>> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
>>>> +    drck->typename = "MEM";
>>>> +    drck->drc_name_prefix = "PMEM ";
>>>> +    drck->release = NULL;
>>>> +}
>>>> +
>>>>    static const TypeInfo spapr_dr_connector_info = {
>>>>        .name          = TYPE_SPAPR_DR_CONNECTOR,
>>>>        .parent        = TYPE_DEVICE,
>>>> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>>>>        .class_init    = spapr_drc_lmb_class_init,
>>>>    };
>>>> +static const TypeInfo spapr_drc_pmem_info = {
>>>> +    .name          = TYPE_SPAPR_DRC_PMEM,
>>>> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
>>>> +    .class_init    = spapr_drc_pmem_class_init,
>>>> +};
>>>> +
>>>>    /* helper functions for external users */
>>>>    sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
>>>> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>>>>        type_register_static(&spapr_drc_cpu_info);
>>>>        type_register_static(&spapr_drc_pci_info);
>>>>        type_register_static(&spapr_drc_lmb_info);
>>>> +    type_register_static(&spapr_drc_pmem_info);
>>>>        spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>>>>                            rtas_set_indicator);
>>>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>>>> index 32719a1b72..a4fed84346 100644
>>>> --- a/hw/ppc/spapr_events.c
>>>> +++ b/hw/ppc/spapr_events.c
>>>> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>>>>    #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>>>>    #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>>>>    #define RTAS_LOG_V6_HP_TYPE_PCI                          5
>>>> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>>>>        uint8_t hotplug_action;
>>>>    #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>>>>    #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
>>>> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>>>>        case SPAPR_DR_CONNECTOR_TYPE_CPU:
>>>>            hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>>>>            break;
>>>> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
>>>> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
>>>> +        break;
>>>>        default:
>>>>            /* we shouldn't be signaling hotplug events for resources
>>>>             * that don't support them
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index a947a0a0dc..21a9709afe 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>>>>        bool cmd_line_caps[SPAPR_CAP_NUM];
>>>>        sPAPRCapabilities def, eff, mig;
>>>> +    bool nvdimm_enabled;
>>>>    };
>>>>    #define H_SUCCESS         0
>>>> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>>>>    #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>>>>    #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>>>> +/*
>>>> + * The nvdimm size should be aligned to SCM block size.
>>>> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
>>>> + * inorder to have SCM regions not to overlap with dimm memory regions.
>>>> + * The SCM devices can have variable block sizes. For now, fixing the
>>>> + * block size to the minimum value.
>>>> + */
>>>> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
>>>> +
>>>>    void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>>>>    #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
>>>> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
>>>> index f6ff32e7e2..65925d00b1 100644
>>>> --- a/include/hw/ppc/spapr_drc.h
>>>> +++ b/include/hw/ppc/spapr_drc.h
>>>> @@ -70,6 +70,13 @@
>>>>    #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>>>                                            TYPE_SPAPR_DRC_LMB)
>>>> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
>>>> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
>>>> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
>>>> +#define SPAPR_DRC_PMEM_CLASS(klass) \
>>>> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
>>>> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>>> +                                        TYPE_SPAPR_DRC_PMEM)
>>>>    /*
>>>>     * Various hotplug types managed by sPAPRDRConnector
>>>>     *
>>>> @@ -87,6 +94,7 @@ typedef enum {
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
>>>> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>>>>    } sPAPRDRConnectorTypeShift;
>>>>    typedef enum {
>>>> @@ -96,6 +104,7 @@ typedef enum {
>>>>        SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>>>>        SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>>>>        SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
>>>> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>>>>    } sPAPRDRConnectorType;
>>>>    /*
>>>>
Igor Mammedov Feb. 19, 2019, 8:11 a.m. UTC | #5
On Tue, 05 Feb 2019 23:26:27 -0600
Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:

> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
> device interface in QEMU to support virtual NVDIMM devices for Power (May have
> to re-look at this later).  Create the required DT entries for the
> device (some entries have dummy values right now).
> 
> The patch creates the required DT node and sends a hotplug
> interrupt to the guest. Guest is expected to undertake the normal
> DR resource add path in response and start issuing PAPR SCM hcalls.
> 
> This is how it can be used ..
> Add nvdimm=on to the qemu machine argument.
> Ex : -machine pseries,nvdimm=on
> For coldplug, the device to be added in qemu command line as shown below
> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> 
> For hotplug, the device to be added from monitor as below
> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>                [Early implementation]
> ---
>  default-configs/ppc64-softmmu.mak |    1 
>  hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>  hw/ppc/spapr_drc.c                |   17 +++
>  hw/ppc/spapr_events.c             |    4 +
>  include/hw/ppc/spapr.h            |   10 ++
>  include/hw/ppc/spapr_drc.h        |    9 ++
>  6 files changed, 241 insertions(+), 12 deletions(-)
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 7f34ad0528..b6e1aa5125 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>  CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>  CONFIG_MEM_DEVICE=y
>  CONFIG_DIMM=y
> +CONFIG_NVDIMM=y
>  CONFIG_SPAPR_RNG=y
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 0fcdd35cbe..7e7a1a8041 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -73,6 +73,7 @@
>  #include "qemu/cutils.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/mem/nvdimm.h"
>  
>  #include <libfdt.h>
>  
> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>      uint8_t *int_buf, *cur_index, buf_len;
>      int ret;
>      uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>      uint64_t addr, cur_addr, size;
>      uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>      uint64_t mem_end = machine->device_memory->base +
> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>              nr_entries++;
>          }
>  
> -        /* Entry for DIMM */
> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> -        g_assert(drc);
> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
> -                                     spapr_drc_index(drc), node,
> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
> +            /* Entry for NVDIMM */
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
> +            g_assert(drc);
> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
> +                                         spapr_drc_index(drc), -1, 0);
> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
> +        } else {
> +            /* Entry for DIMM */
> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> +            g_assert(drc);
> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
> +                                         spapr_drc_index(drc), node,
> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
> +            cur_addr = addr + size;
> +        }
>          QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>          nr_entries++;
> -        cur_addr = addr + size;
>      }
>  
>      /* Entry for remaining hotpluggable area */
> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>      }
>  }
>  
> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
> +                                      uint32_t node, uint64_t addr,
> +                                      uint64_t size, uint64_t label_size);
> +static void spapr_create_nvdimm(void *fdt)
> +{
> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
> +    GSList *dimms = NULL;
> +
> +    if (offset < 0) {
> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
> +        _FDT(offset);
> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
> +                                 "ibm,persistent-memory")));
> +    }
> +
> +    /*NB : Add drc-info array here */
> +
> +    /* Create DT entries for cold plugged NVDIMM devices */
> +    dimms = nvdimm_get_device_list();
> +    for (; dimms; dimms = dimms->next) {
> +        NVDIMMDevice *nvdimm = dimms->data;
> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
> +        uint64_t lsize = nvdimm->label_size;
> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> +                                           NULL);
> +
> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
> +                                   size, lsize);
> +    }
> +    g_slist_free(dimms);
> +    return;
> +}
> +
>  static void *spapr_build_fdt(sPAPRMachineState *spapr)
>  {
>      MachineState *machine = MACHINE(spapr);
> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>          exit(1);
>      }
>  
> +    /* NVDIMM devices */
> +    if (spapr->nvdimm_enabled) {
> +        spapr_create_nvdimm(fdt);
> +    }
> +
>      return fdt;
>  }
>  
> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>      }
>  }
>  
> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    return spapr->nvdimm_enabled;
> +}
> +
> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> +
> +    spapr->nvdimm_enabled = value;
> +}
> +
>  static void spapr_instance_init(Object *obj)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>      object_property_set_description(obj, "ic-mode",
>                   "Specifies the interrupt controller mode (xics, xive, dual)",
>                   NULL);
> +    object_property_add_bool(obj, "nvdimm",
> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
> +    object_property_set_description(obj, "nvdimm",
> +                                    "Enable support for nvdimm devices",
> +                                    NULL);
>  }
>  
>  static void spapr_machine_finalizefn(Object *obj)
> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>      }
>  }
>  
> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
> +                                      uint64_t addr, uint64_t size,
> +                                      uint64_t label_size)
> +{
> +    int offset;
> +    char buf[40];
> +    GString *lcode = g_string_sized_new(10);
> +    sPAPRDRConnector *drc;
> +    QemuUUID uuid;
> +    uint32_t drc_idx;
> +    uint32_t associativity[] = {
> +        cpu_to_be32(0x4), /* length */
> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
> +        cpu_to_be32(0x0), cpu_to_be32(node)
> +    };
> +
> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    g_assert(drc);
> +
> +    drc_idx = spapr_drc_index(drc);
> +
> +    sprintf(buf, "pmem@%x", drc_idx);
> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
> +    _FDT(offset);
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
> +
> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
> +    g_string_free(lcode, TRUE);
> +
> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> +                      sizeof(associativity))));
> +    g_random_set_seed(drc_idx);
> +    qemu_uuid_generate(&uuid);
> +
> +    qemu_uuid_unparse(&uuid, buf);
> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> +
> +    /*NB : What it should be? */
> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> +
> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> +
> +    return offset;
> +}
> +
> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> +                             uint64_t size, uint32_t node,
> +                             Error **errp)
> +{
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> +    sPAPRDRConnector *drc;
> +    bool hotplugged = spapr_drc_hotplugged(dev);
> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> +    void *fdt;
> +    int fdt_offset, fdt_size;
> +    Error *local_err = NULL;
> +
> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> +    g_assert(drc);
> +
> +    fdt = create_device_tree(&fdt_size);
> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
> +                                            size, nvdimm->label_size);
> +
> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
> +    if (hotplugged) {
> +        spapr_hotplug_req_add_by_index(drc);
> +    }
> +}
> +
>  static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>                                Error **errp)
>  {
>      Error *local_err = NULL;
>      sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>      PCDIMMDevice *dimm = PC_DIMM(dev);
> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>      uint64_t size, addr;
>      uint32_t node;
>  
> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  
>      node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>                                      &error_abort);
> -    spapr_add_lmbs(dev, addr, size, node,
> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> -                   &local_err);
> +    if (!is_nvdimm) {
> +        spapr_add_lmbs(dev, addr, size, node,
> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> +                       &local_err);
> +    } else {
> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
> +    }
> +
>      if (local_err) {
>          goto out_unplug;
>      }
> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  {
>      const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>      sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>      PCDIMMDevice *dimm = PC_DIMM(dev);
>      Error *local_err = NULL;
>      uint64_t size;
> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>          return;
>      }
>  
> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>          error_setg(errp, "Hotplugged memory size must be a multiple of "
> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>          return;
> +    } else if (is_nvdimm) {
> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> +            return;
> +        }
> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
> +            error_setg(errp, "NVDIMM size must be atleast "
> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> +            return;
> +        }
> +
> +        /* Align to scm block size, exclude the label */
> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
I'm not sure that arbitrarily fixing up the region size is the right thing to do.
Also, what you are trying to achieve here isn't clear; could you explain it some more?

> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
>      }
>  
>      memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 2edb7d1e9c..94ddd102cc 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>      drck->release = spapr_lmb_release;
>  }
>  
> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
> +{
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
> +
> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
> +    drck->typename = "MEM";
> +    drck->drc_name_prefix = "PMEM ";
> +    drck->release = NULL;
> +}
> +
>  static const TypeInfo spapr_dr_connector_info = {
>      .name          = TYPE_SPAPR_DR_CONNECTOR,
>      .parent        = TYPE_DEVICE,
> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>      .class_init    = spapr_drc_lmb_class_init,
>  };
>  
> +static const TypeInfo spapr_drc_pmem_info = {
> +    .name          = TYPE_SPAPR_DRC_PMEM,
> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
> +    .class_init    = spapr_drc_pmem_class_init,
> +};
> +
>  /* helper functions for external users */
>  
>  sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>      type_register_static(&spapr_drc_cpu_info);
>      type_register_static(&spapr_drc_pci_info);
>      type_register_static(&spapr_drc_lmb_info);
> +    type_register_static(&spapr_drc_pmem_info);
>  
>      spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>                          rtas_set_indicator);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 32719a1b72..a4fed84346 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>  #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>  #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>  #define RTAS_LOG_V6_HP_TYPE_PCI                          5
> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>      uint8_t hotplug_action;
>  #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>  #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>      case SPAPR_DR_CONNECTOR_TYPE_CPU:
>          hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>          break;
> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
> +        break;
>      default:
>          /* we shouldn't be signaling hotplug events for resources
>           * that don't support them
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index a947a0a0dc..21a9709afe 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>  
>      bool cmd_line_caps[SPAPR_CAP_NUM];
>      sPAPRCapabilities def, eff, mig;
> +    bool nvdimm_enabled;
>  };
>  
>  #define H_SUCCESS         0
> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>  #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>  #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>  
> +/*
> + * The nvdimm size should be aligned to SCM block size.
> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
> + * inorder to have SCM regions not to overlap with dimm memory regions.
> + * The SCM devices can have variable block sizes. For now, fixing the
> + * block size to the minimum value.
> + */
> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
> +
>  void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>  
>  #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> index f6ff32e7e2..65925d00b1 100644
> --- a/include/hw/ppc/spapr_drc.h
> +++ b/include/hw/ppc/spapr_drc.h
> @@ -70,6 +70,13 @@
>  #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>                                          TYPE_SPAPR_DRC_LMB)
>  
> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
> +#define SPAPR_DRC_PMEM_CLASS(klass) \
> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> +                                        TYPE_SPAPR_DRC_PMEM)
>  /*
>   * Various hotplug types managed by sPAPRDRConnector
>   *
> @@ -87,6 +94,7 @@ typedef enum {
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>      SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>  } sPAPRDRConnectorTypeShift;
>  
>  typedef enum {
> @@ -96,6 +104,7 @@ typedef enum {
>      SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>      SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>      SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>  } sPAPRDRConnectorType;
>  
>  /*
> 
>
Shivaprasad G Bhat Feb. 19, 2019, 9:29 a.m. UTC | #6
On 02/19/2019 01:41 PM, Igor Mammedov wrote:
> On Tue, 05 Feb 2019 23:26:27 -0600
> Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
>
>> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
>> device interface in QEMU to support virtual NVDIMM devices for Power (May have
>> to re-look at this later).  Create the required DT entries for the
>> device (some entries have dummy values right now).
>>
>> The patch creates the required DT node and sends a hotplug
>> interrupt to the guest. Guest is expected to undertake the normal
>> DR resource add path in response and start issuing PAPR SCM hcalls.
>>
>> This is how it can be used ..
>> Add nvdimm=on to the qemu machine argument.
>> Ex : -machine pseries,nvdimm=on
>> For coldplug, the device to be added in qemu command line as shown below
>> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>
>> For hotplug, the device to be added from monitor as below
>> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>
>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>>                 [Early implementation]
>> ---
>>   default-configs/ppc64-softmmu.mak |    1
>>   hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>>   hw/ppc/spapr_drc.c                |   17 +++
>>   hw/ppc/spapr_events.c             |    4 +
>>   include/hw/ppc/spapr.h            |   10 ++
>>   include/hw/ppc/spapr_drc.h        |    9 ++
>>   6 files changed, 241 insertions(+), 12 deletions(-)
>>
>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
>> index 7f34ad0528..b6e1aa5125 100644
>> --- a/default-configs/ppc64-softmmu.mak
>> +++ b/default-configs/ppc64-softmmu.mak
>> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>>   CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>>   CONFIG_MEM_DEVICE=y
>>   CONFIG_DIMM=y
>> +CONFIG_NVDIMM=y
>>   CONFIG_SPAPR_RNG=y
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 0fcdd35cbe..7e7a1a8041 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -73,6 +73,7 @@
>>   #include "qemu/cutils.h"
>>   #include "hw/ppc/spapr_cpu_core.h"
>>   #include "hw/mem/memory-device.h"
>> +#include "hw/mem/nvdimm.h"
>>   
>>   #include <libfdt.h>
>>   
>> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>       uint8_t *int_buf, *cur_index, buf_len;
>>       int ret;
>>       uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
>> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>>       uint64_t addr, cur_addr, size;
>>       uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>>       uint64_t mem_end = machine->device_memory->base +
>> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>               nr_entries++;
>>           }
>>   
>> -        /* Entry for DIMM */
>> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>> -        g_assert(drc);
>> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
>> -                                     spapr_drc_index(drc), node,
>> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
>> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
>> +            /* Entry for NVDIMM */
>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
>> +            g_assert(drc);
>> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
>> +                                         spapr_drc_index(drc), -1, 0);
>> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
>> +        } else {
>> +            /* Entry for DIMM */
>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>> +            g_assert(drc);
>> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
>> +                                         spapr_drc_index(drc), node,
>> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
>> +            cur_addr = addr + size;
>> +        }
>>           QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>>           nr_entries++;
>> -        cur_addr = addr + size;
>>       }
>>   
>>       /* Entry for remaining hotpluggable area */
>> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>>       }
>>   }
>>   
>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
>> +                                      uint32_t node, uint64_t addr,
>> +                                      uint64_t size, uint64_t label_size);
>> +static void spapr_create_nvdimm(void *fdt)
>> +{
>> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
>> +    GSList *dimms = NULL;
>> +
>> +    if (offset < 0) {
>> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
>> +        _FDT(offset);
>> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
>> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
>> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
>> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
>> +                                 "ibm,persistent-memory")));
>> +    }
>> +
>> +    /*NB : Add drc-info array here */
>> +
>> +    /* Create DT entries for cold plugged NVDIMM devices */
>> +    dimms = nvdimm_get_device_list();
>> +    for (; dimms; dimms = dimms->next) {
>> +        NVDIMMDevice *nvdimm = dimms->data;
>> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
>> +        uint64_t lsize = nvdimm->label_size;
>> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
>> +                                           NULL);
>> +
>> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
>> +                                   size, lsize);
>> +    }
>> +    g_slist_free(dimms);
>> +    return;
>> +}
>> +
>>   static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>   {
>>       MachineState *machine = MACHINE(spapr);
>> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>           exit(1);
>>       }
>>   
>> +    /* NVDIMM devices */
>> +    if (spapr->nvdimm_enabled) {
>> +        spapr_create_nvdimm(fdt);
>> +    }
>> +
>>       return fdt;
>>   }
>>   
>> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>>       }
>>   }
>>   
>> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    return spapr->nvdimm_enabled;
>> +}
>> +
>> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> +
>> +    spapr->nvdimm_enabled = value;
>> +}
>> +
>>   static void spapr_instance_init(Object *obj)
>>   {
>>       sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>>       object_property_set_description(obj, "ic-mode",
>>                    "Specifies the interrupt controller mode (xics, xive, dual)",
>>                    NULL);
>> +    object_property_add_bool(obj, "nvdimm",
>> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
>> +    object_property_set_description(obj, "nvdimm",
>> +                                    "Enable support for nvdimm devices",
>> +                                    NULL);
>>   }
>>   
>>   static void spapr_machine_finalizefn(Object *obj)
>> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>       }
>>   }
>>   
>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
>> +                                      uint64_t addr, uint64_t size,
>> +                                      uint64_t label_size)
>> +{
>> +    int offset;
>> +    char buf[40];
>> +    GString *lcode = g_string_sized_new(10);
>> +    sPAPRDRConnector *drc;
>> +    QemuUUID uuid;
>> +    uint32_t drc_idx;
>> +    uint32_t associativity[] = {
>> +        cpu_to_be32(0x4), /* length */
>> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
>> +        cpu_to_be32(0x0), cpu_to_be32(node)
>> +    };
>> +
>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    g_assert(drc);
>> +
>> +    drc_idx = spapr_drc_index(drc);
>> +
>> +    sprintf(buf, "pmem@%x", drc_idx);
>> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
>> +    _FDT(offset);
>> +
>> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
>> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
>> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
>> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
>> +
>> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
>> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
>> +    g_string_free(lcode, TRUE);
>> +
>> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
>> +                      sizeof(associativity))));
>> +    g_random_set_seed(drc_idx);
>> +    qemu_uuid_generate(&uuid);
>> +
>> +    qemu_uuid_unparse(&uuid, buf);
>> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
>> +
>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
>> +
>> +    /*NB : What it should be? */
>> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
>> +
>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
>> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
>> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
>> +
>> +    return offset;
>> +}
>> +
>> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
>> +                             uint64_t size, uint32_t node,
>> +                             Error **errp)
>> +{
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
>> +    sPAPRDRConnector *drc;
>> +    bool hotplugged = spapr_drc_hotplugged(dev);
>> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>> +    void *fdt;
>> +    int fdt_offset, fdt_size;
>> +    Error *local_err = NULL;
>> +
>> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
>> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>> +    g_assert(drc);
>> +
>> +    fdt = create_device_tree(&fdt_size);
>> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
>> +                                            size, nvdimm->label_size);
>> +
>> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>> +    if (hotplugged) {
>> +        spapr_hotplug_req_add_by_index(drc);
>> +    }
>> +}
>> +
>>   static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>                                 Error **errp)
>>   {
>>       Error *local_err = NULL;
>>       sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>>       PCDIMMDevice *dimm = PC_DIMM(dev);
>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>       uint64_t size, addr;
>>       uint32_t node;
>>   
>> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>   
>>       node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>>                                       &error_abort);
>> -    spapr_add_lmbs(dev, addr, size, node,
>> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>> -                   &local_err);
>> +    if (!is_nvdimm) {
>> +        spapr_add_lmbs(dev, addr, size, node,
>> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>> +                       &local_err);
>> +    } else {
>> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
>> +    }
>> +
>>       if (local_err) {
>>           goto out_unplug;
>>       }
>> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>   {
>>       const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>>       sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>       PCDIMMDevice *dimm = PC_DIMM(dev);
>>       Error *local_err = NULL;
>>       uint64_t size;
>> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>           return;
>>       }
>>   
>> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
>> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>>           error_setg(errp, "Hotplugged memory size must be a multiple of "
>> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>           return;
>> +    } else if (is_nvdimm) {
>> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
>> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
>> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>> +            return;
>> +        }
>> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
>> +            error_setg(errp, "NVDIMM size must be atleast "
>> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>> +            return;
>> +        }
>> +
>> +        /* Align to scm block size, exclude the label */
>> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
>> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
> I'm not sure that arbitrarily fixing up region size is the right thing to do
> and also what you are trying to achieve here isn't clear, could you explain it some more?
The resize is required to allow subsequent memory hotplugs to work. The
base address (if not specified) for the next DIMM hotplug starts at the end
of this region. If the region is not aligned to the LMB size, the guest
refuses to claim the newly hotplugged memory. The label area can be small
and need not be aligned to the (LMB/SCM block) size. The region size is
actually the size minus the label_size, which can be unaligned to the LMB
size. So, aligning down to the SCM block size is necessary here.
>
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>>       }
>>   
>>       memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
>> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
>> index 2edb7d1e9c..94ddd102cc 100644
>> --- a/hw/ppc/spapr_drc.c
>> +++ b/hw/ppc/spapr_drc.c
>> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>>       drck->release = spapr_lmb_release;
>>   }
>>   
>> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
>> +{
>> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
>> +
>> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
>> +    drck->typename = "MEM";
>> +    drck->drc_name_prefix = "PMEM ";
>> +    drck->release = NULL;
>> +}
>> +
>>   static const TypeInfo spapr_dr_connector_info = {
>>       .name          = TYPE_SPAPR_DR_CONNECTOR,
>>       .parent        = TYPE_DEVICE,
>> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>>       .class_init    = spapr_drc_lmb_class_init,
>>   };
>>   
>> +static const TypeInfo spapr_drc_pmem_info = {
>> +    .name          = TYPE_SPAPR_DRC_PMEM,
>> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
>> +    .class_init    = spapr_drc_pmem_class_init,
>> +};
>> +
>>   /* helper functions for external users */
>>   
>>   sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
>> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>>       type_register_static(&spapr_drc_cpu_info);
>>       type_register_static(&spapr_drc_pci_info);
>>       type_register_static(&spapr_drc_lmb_info);
>> +    type_register_static(&spapr_drc_pmem_info);
>>   
>>       spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>>                           rtas_set_indicator);
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 32719a1b72..a4fed84346 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>>   #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>>   #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>>   #define RTAS_LOG_V6_HP_TYPE_PCI                          5
>> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>>       uint8_t hotplug_action;
>>   #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>>   #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
>> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>>       case SPAPR_DR_CONNECTOR_TYPE_CPU:
>>           hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>>           break;
>> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
>> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
>> +        break;
>>       default:
>>           /* we shouldn't be signaling hotplug events for resources
>>            * that don't support them
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index a947a0a0dc..21a9709afe 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>>   
>>       bool cmd_line_caps[SPAPR_CAP_NUM];
>>       sPAPRCapabilities def, eff, mig;
>> +    bool nvdimm_enabled;
>>   };
>>   
>>   #define H_SUCCESS         0
>> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>>   #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>>   #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>>   
>> +/*
>> + * The nvdimm size should be aligned to SCM block size.
>> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
>> + * inorder to have SCM regions not to overlap with dimm memory regions.
>> + * The SCM devices can have variable block sizes. For now, fixing the
>> + * block size to the minimum value.
>> + */
>> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
>> +
>>   void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>>   
>>   #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
>> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
>> index f6ff32e7e2..65925d00b1 100644
>> --- a/include/hw/ppc/spapr_drc.h
>> +++ b/include/hw/ppc/spapr_drc.h
>> @@ -70,6 +70,13 @@
>>   #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>                                           TYPE_SPAPR_DRC_LMB)
>>   
>> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
>> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
>> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
>> +#define SPAPR_DRC_PMEM_CLASS(klass) \
>> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
>> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>> +                                        TYPE_SPAPR_DRC_PMEM)
>>   /*
>>    * Various hotplug types managed by sPAPRDRConnector
>>    *
>> @@ -87,6 +94,7 @@ typedef enum {
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
>> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>>   } sPAPRDRConnectorTypeShift;
>>   
>>   typedef enum {
>> @@ -96,6 +104,7 @@ typedef enum {
>>       SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>>       SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>>       SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
>> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>>   } sPAPRDRConnectorType;
>>   
>>   /*
>>
>>
Igor Mammedov Feb. 21, 2019, 2:12 p.m. UTC | #7
On Tue, 19 Feb 2019 14:59:25 +0530
Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:

> On 02/19/2019 01:41 PM, Igor Mammedov wrote:
> > On Tue, 05 Feb 2019 23:26:27 -0600
> > Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
> >  
> >> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
> >> device interface in QEMU to support virtual NVDIMM devices for Power (May have
> >> to re-look at this later).  Create the required DT entries for the
> >> device (some entries have dummy values right now).
> >>
> >> The patch creates the required DT node and sends a hotplug
> >> interrupt to the guest. Guest is expected to undertake the normal
> >> DR resource add path in response and start issuing PAPR SCM hcalls.
> >>
> >> This is how it can be used ..
> >> Add nvdimm=on to the qemu machine argument.
> >> Ex : -machine pseries,nvdimm=on
> >> For coldplug, the device to be added in qemu command line as shown below
> >> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> >> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> >>
> >> For hotplug, the device to be added from monitor as below
> >> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> >> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> >>
> >> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> >> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> >>                 [Early implementation]
> >> ---
> >>   default-configs/ppc64-softmmu.mak |    1
> >>   hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
> >>   hw/ppc/spapr_drc.c                |   17 +++
> >>   hw/ppc/spapr_events.c             |    4 +
> >>   include/hw/ppc/spapr.h            |   10 ++
> >>   include/hw/ppc/spapr_drc.h        |    9 ++
> >>   6 files changed, 241 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> >> index 7f34ad0528..b6e1aa5125 100644
> >> --- a/default-configs/ppc64-softmmu.mak
> >> +++ b/default-configs/ppc64-softmmu.mak
> >> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
> >>   CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
> >>   CONFIG_MEM_DEVICE=y
> >>   CONFIG_DIMM=y
> >> +CONFIG_NVDIMM=y
> >>   CONFIG_SPAPR_RNG=y
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 0fcdd35cbe..7e7a1a8041 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -73,6 +73,7 @@
> >>   #include "qemu/cutils.h"
> >>   #include "hw/ppc/spapr_cpu_core.h"
> >>   #include "hw/mem/memory-device.h"
> >> +#include "hw/mem/nvdimm.h"
> >>   
> >>   #include <libfdt.h>
> >>   
> >> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> >>       uint8_t *int_buf, *cur_index, buf_len;
> >>       int ret;
> >>       uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> >> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> >>       uint64_t addr, cur_addr, size;
> >>       uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
> >>       uint64_t mem_end = machine->device_memory->base +
> >> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> >>               nr_entries++;
> >>           }
> >>   
> >> -        /* Entry for DIMM */
> >> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> >> -        g_assert(drc);
> >> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
> >> -                                     spapr_drc_index(drc), node,
> >> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
> >> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
> >> +            /* Entry for NVDIMM */
> >> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
> >> +            g_assert(drc);
> >> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
> >> +                                         spapr_drc_index(drc), -1, 0);
> >> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
> >> +        } else {
> >> +            /* Entry for DIMM */
> >> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> >> +            g_assert(drc);
> >> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
> >> +                                         spapr_drc_index(drc), node,
> >> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
> >> +            cur_addr = addr + size;
> >> +        }
> >>           QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> >>           nr_entries++;
> >> -        cur_addr = addr + size;
> >>       }
> >>   
> >>       /* Entry for remaining hotpluggable area */
> >> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
> >>       }
> >>   }
> >>   
> >> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
> >> +                                      uint32_t node, uint64_t addr,
> >> +                                      uint64_t size, uint64_t label_size);
> >> +static void spapr_create_nvdimm(void *fdt)
> >> +{
> >> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
> >> +    GSList *dimms = NULL;
> >> +
> >> +    if (offset < 0) {
> >> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
> >> +        _FDT(offset);
> >> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
> >> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
> >> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
> >> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
> >> +                                 "ibm,persistent-memory")));
> >> +    }
> >> +
> >> +    /*NB : Add drc-info array here */
> >> +
> >> +    /* Create DT entries for cold plugged NVDIMM devices */
> >> +    dimms = nvdimm_get_device_list();
> >> +    for (; dimms; dimms = dimms->next) {
> >> +        NVDIMMDevice *nvdimm = dimms->data;
> >> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
> >> +        uint64_t lsize = nvdimm->label_size;
> >> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> >> +                                           NULL);
> >> +
> >> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
> >> +                                   size, lsize);
> >> +    }
> >> +    g_slist_free(dimms);
> >> +    return;
> >> +}
> >> +
> >>   static void *spapr_build_fdt(sPAPRMachineState *spapr)
> >>   {
> >>       MachineState *machine = MACHINE(spapr);
> >> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
> >>           exit(1);
> >>       }
> >>   
> >> +    /* NVDIMM devices */
> >> +    if (spapr->nvdimm_enabled) {
> >> +        spapr_create_nvdimm(fdt);
> >> +    }
> >> +
> >>       return fdt;
> >>   }
> >>   
> >> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
> >>       }
> >>   }
> >>   
> >> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
> >> +{
> >> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >> +
> >> +    return spapr->nvdimm_enabled;
> >> +}
> >> +
> >> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
> >> +{
> >> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >> +
> >> +    spapr->nvdimm_enabled = value;
> >> +}
> >> +
> >>   static void spapr_instance_init(Object *obj)
> >>   {
> >>       sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
> >>       object_property_set_description(obj, "ic-mode",
> >>                    "Specifies the interrupt controller mode (xics, xive, dual)",
> >>                    NULL);
> >> +    object_property_add_bool(obj, "nvdimm",
> >> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
> >> +    object_property_set_description(obj, "nvdimm",
> >> +                                    "Enable support for nvdimm devices",
> >> +                                    NULL);
> >>   }
> >>   
> >>   static void spapr_machine_finalizefn(Object *obj)
> >> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> >>       }
> >>   }
> >>   
> >> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
> >> +                                      uint64_t addr, uint64_t size,
> >> +                                      uint64_t label_size)
> >> +{
> >> +    int offset;
> >> +    char buf[40];
> >> +    GString *lcode = g_string_sized_new(10);
> >> +    sPAPRDRConnector *drc;
> >> +    QemuUUID uuid;
> >> +    uint32_t drc_idx;
> >> +    uint32_t associativity[] = {
> >> +        cpu_to_be32(0x4), /* length */
> >> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
> >> +        cpu_to_be32(0x0), cpu_to_be32(node)
> >> +    };
> >> +
> >> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> >> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >> +    g_assert(drc);
> >> +
> >> +    drc_idx = spapr_drc_index(drc);
> >> +
> >> +    sprintf(buf, "pmem@%x", drc_idx);
> >> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
> >> +    _FDT(offset);
> >> +
> >> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
> >> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
> >> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
> >> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
> >> +
> >> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
> >> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
> >> +    g_string_free(lcode, TRUE);
> >> +
> >> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> >> +                      sizeof(associativity))));
> >> +    g_random_set_seed(drc_idx);
> >> +    qemu_uuid_generate(&uuid);
> >> +
> >> +    qemu_uuid_unparse(&uuid, buf);
> >> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> >> +
> >> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> >> +
> >> +    /*NB : What it should be? */
> >> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> >> +
> >> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> >> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> >> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> >> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> >> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> >> +
> >> +    return offset;
> >> +}
> >> +
> >> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> >> +                             uint64_t size, uint32_t node,
> >> +                             Error **errp)
> >> +{
> >> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> >> +    sPAPRDRConnector *drc;
> >> +    bool hotplugged = spapr_drc_hotplugged(dev);
> >> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> >> +    void *fdt;
> >> +    int fdt_offset, fdt_size;
> >> +    Error *local_err = NULL;
> >> +
> >> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> >> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> >> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >> +    g_assert(drc);
> >> +
> >> +    fdt = create_device_tree(&fdt_size);
> >> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
> >> +                                            size, nvdimm->label_size);
> >> +
> >> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> >> +    if (local_err) {
> >> +        error_propagate(errp, local_err);
> >> +        return;
> >> +    }
> >> +
> >> +    if (hotplugged) {
> >> +        spapr_hotplug_req_add_by_index(drc);
> >> +    }
> >> +}
> >> +
> >>   static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>                                 Error **errp)
> >>   {
> >>       Error *local_err = NULL;
> >>       sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
> >>       PCDIMMDevice *dimm = PC_DIMM(dev);
> >> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> >>       uint64_t size, addr;
> >>       uint32_t node;
> >>   
> >> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>   
> >>       node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
> >>                                       &error_abort);
> >> -    spapr_add_lmbs(dev, addr, size, node,
> >> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> >> -                   &local_err);
> >> +    if (!is_nvdimm) {
> >> +        spapr_add_lmbs(dev, addr, size, node,
> >> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> >> +                       &local_err);
> >> +    } else {
> >> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
> >> +    }
> >> +
> >>       if (local_err) {
> >>           goto out_unplug;
> >>       }
> >> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>   {
> >>       const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
> >>       sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
> >> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> >>       PCDIMMDevice *dimm = PC_DIMM(dev);
> >>       Error *local_err = NULL;
> >>       uint64_t size;
> >> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>           return;
> >>       }
> >>   
> >> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
> >> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
> >>           error_setg(errp, "Hotplugged memory size must be a multiple of "
> >> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> >> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> >>           return;
> >> +    } else if (is_nvdimm) {
> >> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> >> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> >> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
> >> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> >> +            return;
> >> +        }
> >> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
> >> +            error_setg(errp, "NVDIMM size must be atleast "
> >> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> >> +            return;
> >> +        }
On second glance, two things look weird here:
  1. We shouldn't poke inside the nvdimm object directly; there is NVDIMM_LABEL_SIZE_PROP
     if you really need to get the label size.
  2. Why do we need to care about label_size here at all?

> >> +        /* Align to scm block size, exclude the label */
> >> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
> >> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);  
> > I'm not sure that arbitrarily fixing up region size is the right thing to do
> > and also what you are trying to achieve here isn't clear, could you explain it some more?  
> The resize is required to allow the subsequent memory hotplugs to work. The
> base address(if not specified) for the next dimm hotplug, starts at the 
> end of
> this region. If the region is not aligned to LMB size, guest refuses to 
> claim the
> newly hotplugged memory.  The label area can be small and need not be
> aligned to (LMB/SCM block) size. The region size is actually the size 
> minus the
> label_size which can be unaligned to LMB size. So, align down to SCM block
> size is necessary here.
Well, fixing up an object (MemoryRegion) which belongs to the backend from
the machine level to satisfy machine-specific alignment requirements looks
like the wrong thing to do.

So we need to come up with another approach.
I'm still not sure what the problem is, but nvdimm already
has a notion of a data region (without the label size): look at
nvdimm->nvdimm_mr and mdc->get_memory_region; that is what you have in the
local var 'size'. So what you are doing here looks even more incorrect:
besides the fact that we shouldn't do it at all, you are
sizing down a data area which already excludes the label size.

What I'd suggest is to align up the GPA of the memory being added to
   MAX(LMB size, backend_page_size, max supported huge page size)
so that a hotplugged DIMM or whatever else would be properly aligned;
see pc_dimm_pre_plug(,legacy_align,) and how PC uses it.

> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >>       }
> >>   
> >>       memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
> >> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> >> index 2edb7d1e9c..94ddd102cc 100644
> >> --- a/hw/ppc/spapr_drc.c
> >> +++ b/hw/ppc/spapr_drc.c
> >> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
> >>       drck->release = spapr_lmb_release;
> >>   }
> >>   
> >> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
> >> +{
> >> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
> >> +
> >> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
> >> +    drck->typename = "MEM";
> >> +    drck->drc_name_prefix = "PMEM ";
> >> +    drck->release = NULL;
> >> +}
> >> +
> >>   static const TypeInfo spapr_dr_connector_info = {
> >>       .name          = TYPE_SPAPR_DR_CONNECTOR,
> >>       .parent        = TYPE_DEVICE,
> >> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
> >>       .class_init    = spapr_drc_lmb_class_init,
> >>   };
> >>   
> >> +static const TypeInfo spapr_drc_pmem_info = {
> >> +    .name          = TYPE_SPAPR_DRC_PMEM,
> >> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
> >> +    .class_init    = spapr_drc_pmem_class_init,
> >> +};
> >> +
> >>   /* helper functions for external users */
> >>   
> >>   sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
> >> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
> >>       type_register_static(&spapr_drc_cpu_info);
> >>       type_register_static(&spapr_drc_pci_info);
> >>       type_register_static(&spapr_drc_lmb_info);
> >> +    type_register_static(&spapr_drc_pmem_info);
> >>   
> >>       spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
> >>                           rtas_set_indicator);
> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >> index 32719a1b72..a4fed84346 100644
> >> --- a/hw/ppc/spapr_events.c
> >> +++ b/hw/ppc/spapr_events.c
> >> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
> >>   #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
> >>   #define RTAS_LOG_V6_HP_TYPE_PHB                          4
> >>   #define RTAS_LOG_V6_HP_TYPE_PCI                          5
> >> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
> >>       uint8_t hotplug_action;
> >>   #define RTAS_LOG_V6_HP_ACTION_ADD                        1
> >>   #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
> >> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> >>       case SPAPR_DR_CONNECTOR_TYPE_CPU:
> >>           hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
> >>           break;
> >> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
> >> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
> >> +        break;
> >>       default:
> >>           /* we shouldn't be signaling hotplug events for resources
> >>            * that don't support them
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index a947a0a0dc..21a9709afe 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
> >>   
> >>       bool cmd_line_caps[SPAPR_CAP_NUM];
> >>       sPAPRCapabilities def, eff, mig;
> >> +    bool nvdimm_enabled;
> >>   };
> >>   
> >>   #define H_SUCCESS         0
> >> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
> >>   #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
> >>   #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
> >>   
> >> +/*
> >> + * The nvdimm size should be aligned to SCM block size.
> >> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
> >> + * inorder to have SCM regions not to overlap with dimm memory regions.
> >> + * The SCM devices can have variable block sizes. For now, fixing the
> >> + * block size to the minimum value.
> >> + */
> >> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
> >> +
> >>   void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
> >>   
> >>   #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> >> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> >> index f6ff32e7e2..65925d00b1 100644
> >> --- a/include/hw/ppc/spapr_drc.h
> >> +++ b/include/hw/ppc/spapr_drc.h
> >> @@ -70,6 +70,13 @@
> >>   #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> >>                                           TYPE_SPAPR_DRC_LMB)
> >>   
> >> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
> >> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
> >> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
> >> +#define SPAPR_DRC_PMEM_CLASS(klass) \
> >> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
> >> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
> >> +                                        TYPE_SPAPR_DRC_PMEM)
> >>   /*
> >>    * Various hotplug types managed by sPAPRDRConnector
> >>    *
> >> @@ -87,6 +94,7 @@ typedef enum {
> >>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
> >>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
> >>       SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
> >> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
> >>   } sPAPRDRConnectorTypeShift;
> >>   
> >>   typedef enum {
> >> @@ -96,6 +104,7 @@ typedef enum {
> >>       SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
> >>       SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
> >>       SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
> >> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
> >>   } sPAPRDRConnectorType;
> >>   
> >>   /*
> >>
> >>  
>
David Gibson Feb. 27, 2019, 4:27 a.m. UTC | #8
On Mon, Feb 18, 2019 at 09:45:13PM +0530, Shivaprasad G Bhat wrote:
> 
> 
> On 02/18/2019 04:32 AM, David Gibson wrote:
> > On Fri, Feb 15, 2019 at 04:41:09PM +0530, Shivaprasad G Bhat wrote:
> > > Thanks for the comments David. Please find my replies inline..
[snip]
> > > > > +
> > > > > +    qemu_uuid_unparse(&uuid, buf);
> > > > > +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> > > > > +
> > > > > +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> > > > > +
> > > > > +    /*NB : What it should be? */
> > > > > +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> > > > > +
> > > > > +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> > > > > +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> > > > > +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> > > > > +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> > > > > +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> > > > > +
> > > > > +    return offset;
> > > > > +}
> > > > > +
> > > > > +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> > > > > +                             uint64_t size, uint32_t node,
> > > > > +                             Error **errp)
> > > > > +{
> > > > > +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> > > > > +    sPAPRDRConnector *drc;
> > > > > +    bool hotplugged = spapr_drc_hotplugged(dev);
> > > > > +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> > > > > +    void *fdt;
> > > > > +    int fdt_offset, fdt_size;
> > > > > +    Error *local_err = NULL;
> > > > > +
> > > > > +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> > > > > +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> > > > > +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> > > > > +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> > > > > +    g_assert(drc);
> > > > Creating the DRC in the hotplug path looks bogus.  Generally the DRC
> > > > has to exist before you can even attempt to plug the device.
> > > We dont really know how many DRC to create. Unlike memory hotplug
> > > where we know how many LMBs are required to fit till the maxmem, in this
> > > case we dont know how many NVDIMM devices  guest can have. That is the
> > > reason I am creating the DRC on demand. I'll see if it is possible to
> > > address this
> > > by putting a cap on maximum number of NVDIMM devices a guest can have.
> > Urgh, PAPR.  First it specifies a crappy hotplug model that requires
> > zillions of fixed attachment points to be instantiated, then it breaks
> > its own model.
> > 
> > But.. I still don't really understand how this works.
> > 
> > a) How does the guest know the DRC index to use for the new NVDIMM?
> >     Generally that comes from the device tree, but the guest doesn't
> >     get new device tree information until it calls configure-connector
> >     for which it needs the DRC index.
> The DRC is passed in the device tree blob passed as payload of hotplug
> interrupt

Um.. there is no device tree blob as payload of a hotplug interrupt.
The guest only gets device tree information when it makes
configure-connector calls.

I see that there is a drc identifier field though, so I guess you're
getting the DRC from that.  In existing cases the guest looks that up
in the *existing* device tree to find information about that DRC.  I
guess in the case of NVDIMMs here it doesn't need any more info.

> from which the guest picks the DRC index and makes the subsequent calls.
> > b) AFAICT, NVDIMMs would also require HPT space, much like regular
> >     memory would.  PowerVM doesn't have HPT resizing, so surely it must
> >     already have some sort of cap on the amount of NVDIMM space in
> >     order to size the HPT correctly.
> On Power KVM we will enforce the NVDIMM is mapped within the maxmem,
> however the spec allows outside of it. Coming back to the original point of
> creating the DRCs at the hotplug time, we could impose a limit on the
> number of NVDIMM devices that could be hotplugged so that we can
> create the DRCs at the machine init time.

Ah, so NVDIMMs live within the same maxmem limit as regular memory.
Ok, I guess that makes sense.
Shivaprasad G Bhat Feb. 28, 2019, 8:54 a.m. UTC | #9
Hi Igor,

Thanks for the elaboration. Please find my response inline.


On 02/21/2019 07:42 PM, Igor Mammedov wrote:
> On Tue, 19 Feb 2019 14:59:25 +0530
> Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
>
>> On 02/19/2019 01:41 PM, Igor Mammedov wrote:
>>> On Tue, 05 Feb 2019 23:26:27 -0600
>>> Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
>>>   
>>>> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
>>>> device interface in QEMU to support virtual NVDIMM devices for Power (May have
>>>> to re-look at this later).  Create the required DT entries for the
>>>> device (some entries have dummy values right now).
>>>>
>>>> The patch creates the required DT node and sends a hotplug
>>>> interrupt to the guest. Guest is expected to undertake the normal
>>>> DR resource add path in response and start issuing PAPR SCM hcalls.
>>>>
>>>> This is how it can be used ..
>>>> Add nvdimm=on to the qemu machine argument.
>>>> Ex : -machine pseries,nvdimm=on
>>>> For coldplug, the device to be added in qemu command line as shown below
>>>> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>>>> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>>>
>>>> For hotplug, the device to be added from monitor as below
>>>> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
>>>> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
>>>>
>>>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>>>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
>>>>                  [Early implementation]
>>>> ---
>>>>    default-configs/ppc64-softmmu.mak |    1
>>>>    hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
>>>>    hw/ppc/spapr_drc.c                |   17 +++
>>>>    hw/ppc/spapr_events.c             |    4 +
>>>>    include/hw/ppc/spapr.h            |   10 ++
>>>>    include/hw/ppc/spapr_drc.h        |    9 ++
>>>>    6 files changed, 241 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
>>>> index 7f34ad0528..b6e1aa5125 100644
>>>> --- a/default-configs/ppc64-softmmu.mak
>>>> +++ b/default-configs/ppc64-softmmu.mak
>>>> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
>>>>    CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
>>>>    CONFIG_MEM_DEVICE=y
>>>>    CONFIG_DIMM=y
>>>> +CONFIG_NVDIMM=y
>>>>    CONFIG_SPAPR_RNG=y
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index 0fcdd35cbe..7e7a1a8041 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -73,6 +73,7 @@
>>>>    #include "qemu/cutils.h"
>>>>    #include "hw/ppc/spapr_cpu_core.h"
>>>>    #include "hw/mem/memory-device.h"
>>>> +#include "hw/mem/nvdimm.h"
>>>>    
>>>>    #include <libfdt.h>
>>>>    
>>>> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>>>        uint8_t *int_buf, *cur_index, buf_len;
>>>>        int ret;
>>>>        uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
>>>> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>>>>        uint64_t addr, cur_addr, size;
>>>>        uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
>>>>        uint64_t mem_end = machine->device_memory->base +
>>>> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
>>>>                nr_entries++;
>>>>            }
>>>>    
>>>> -        /* Entry for DIMM */
>>>> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>>>> -        g_assert(drc);
>>>> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
>>>> -                                     spapr_drc_index(drc), node,
>>>> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
>>>> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
>>>> +            /* Entry for NVDIMM */
>>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
>>>> +            g_assert(drc);
>>>> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
>>>> +                                         spapr_drc_index(drc), -1, 0);
>>>> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
>>>> +        } else {
>>>> +            /* Entry for DIMM */
>>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
>>>> +            g_assert(drc);
>>>> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
>>>> +                                         spapr_drc_index(drc), node,
>>>> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
>>>> +            cur_addr = addr + size;
>>>> +        }
>>>>            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
>>>>            nr_entries++;
>>>> -        cur_addr = addr + size;
>>>>        }
>>>>    
>>>>        /* Entry for remaining hotpluggable area */
>>>> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
>>>>        }
>>>>    }
>>>>    
>>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
>>>> +                                      uint32_t node, uint64_t addr,
>>>> +                                      uint64_t size, uint64_t label_size);
>>>> +static void spapr_create_nvdimm(void *fdt)
>>>> +{
>>>> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
>>>> +    GSList *dimms = NULL;
>>>> +
>>>> +    if (offset < 0) {
>>>> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
>>>> +        _FDT(offset);
>>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
>>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
>>>> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
>>>> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
>>>> +                                 "ibm,persistent-memory")));
>>>> +    }
>>>> +
>>>> +    /*NB : Add drc-info array here */
>>>> +
>>>> +    /* Create DT entries for cold plugged NVDIMM devices */
>>>> +    dimms = nvdimm_get_device_list();
>>>> +    for (; dimms; dimms = dimms->next) {
>>>> +        NVDIMMDevice *nvdimm = dimms->data;
>>>> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
>>>> +        uint64_t lsize = nvdimm->label_size;
>>>> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
>>>> +                                           NULL);
>>>> +
>>>> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
>>>> +                                   size, lsize);
>>>> +    }
>>>> +    g_slist_free(dimms);
>>>> +    return;
>>>> +}
>>>> +
>>>>    static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>>>    {
>>>>        MachineState *machine = MACHINE(spapr);
>>>> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
>>>>            exit(1);
>>>>        }
>>>>    
>>>> +    /* NVDIMM devices */
>>>> +    if (spapr->nvdimm_enabled) {
>>>> +        spapr_create_nvdimm(fdt);
>>>> +    }
>>>> +
>>>>        return fdt;
>>>>    }
>>>>    
>>>> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
>>>>        }
>>>>    }
>>>>    
>>>> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    return spapr->nvdimm_enabled;
>>>> +}
>>>> +
>>>> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> +
>>>> +    spapr->nvdimm_enabled = value;
>>>> +}
>>>> +
>>>>    static void spapr_instance_init(Object *obj)
>>>>    {
>>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
>>>> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
>>>>        object_property_set_description(obj, "ic-mode",
>>>>                     "Specifies the interrupt controller mode (xics, xive, dual)",
>>>>                     NULL);
>>>> +    object_property_add_bool(obj, "nvdimm",
>>>> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
>>>> +    object_property_set_description(obj, "nvdimm",
>>>> +                                    "Enable support for nvdimm devices",
>>>> +                                    NULL);
>>>>    }
>>>>    
>>>>    static void spapr_machine_finalizefn(Object *obj)
>>>> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
>>>>        }
>>>>    }
>>>>    
>>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
>>>> +                                      uint64_t addr, uint64_t size,
>>>> +                                      uint64_t label_size)
>>>> +{
>>>> +    int offset;
>>>> +    char buf[40];
>>>> +    GString *lcode = g_string_sized_new(10);
>>>> +    sPAPRDRConnector *drc;
>>>> +    QemuUUID uuid;
>>>> +    uint32_t drc_idx;
>>>> +    uint32_t associativity[] = {
>>>> +        cpu_to_be32(0x4), /* length */
>>>> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
>>>> +        cpu_to_be32(0x0), cpu_to_be32(node)
>>>> +    };
>>>> +
>>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    g_assert(drc);
>>>> +
>>>> +    drc_idx = spapr_drc_index(drc);
>>>> +
>>>> +    sprintf(buf, "pmem@%x", drc_idx);
>>>> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
>>>> +    _FDT(offset);
>>>> +
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
>>>> +
>>>> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
>>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
>>>> +    g_string_free(lcode, TRUE);
>>>> +
>>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
>>>> +                      sizeof(associativity))));
>>>> +    g_random_set_seed(drc_idx);
>>>> +    qemu_uuid_generate(&uuid);
>>>> +
>>>> +    qemu_uuid_unparse(&uuid, buf);
>>>> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
>>>> +
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
>>>> +
>>>> +    /*NB : What it should be? */
>>>> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
>>>> +
>>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
>>>> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
>>>> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
>>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
>>>> +
>>>> +    return offset;
>>>> +}
>>>> +
>>>> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
>>>> +                             uint64_t size, uint32_t node,
>>>> +                             Error **errp)
>>>> +{
>>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
>>>> +    sPAPRDRConnector *drc;
>>>> +    bool hotplugged = spapr_drc_hotplugged(dev);
>>>> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>>>> +    void *fdt;
>>>> +    int fdt_offset, fdt_size;
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
>>>> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
>>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
>>>> +    g_assert(drc);
>>>> +
>>>> +    fdt = create_device_tree(&fdt_size);
>>>> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
>>>> +                                            size, nvdimm->label_size);
>>>> +
>>>> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
>>>> +    if (local_err) {
>>>> +        error_propagate(errp, local_err);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (hotplugged) {
>>>> +        spapr_hotplug_req_add_by_index(drc);
>>>> +    }
>>>> +}
>>>> +
>>>>    static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>                                  Error **errp)
>>>>    {
>>>>        Error *local_err = NULL;
>>>>        sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
>>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
>>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>>>        uint64_t size, addr;
>>>>        uint32_t node;
>>>>    
>>>> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>    
>>>>        node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
>>>>                                        &error_abort);
>>>> -    spapr_add_lmbs(dev, addr, size, node,
>>>> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>>>> -                   &local_err);
>>>> +    if (!is_nvdimm) {
>>>> +        spapr_add_lmbs(dev, addr, size, node,
>>>> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
>>>> +                       &local_err);
>>>> +    } else {
>>>> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
>>>> +    }
>>>> +
>>>>        if (local_err) {
>>>>            goto out_unplug;
>>>>        }
>>>> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>    {
>>>>        const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
>>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
>>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
>>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
>>>>        Error *local_err = NULL;
>>>>        uint64_t size;
>>>> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>            return;
>>>>        }
>>>>    
>>>> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
>>>> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
>>>>            error_setg(errp, "Hotplugged memory size must be a multiple of "
>>>> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>>> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
>>>>            return;
>>>> +    } else if (is_nvdimm) {
>>>> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
>>>> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
>>>> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
>>>> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>>>> +            return;
>>>> +        }
>>>> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
>>>> +            error_setg(errp, "NVDIMM size must be atleast "
>>>> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
>>>> +            return;
>>>> +        }
> on the second glance 2 things looks weird here:
>    1. we shouldn't poke inside of nvdimm object directly, there is NVDIMM_LABEL_SIZE_PROP
>       if you really need to get label size

Ok. Will use the property.

>    2. why do we need to care about label_size here at all?

On PPC, there is no explicit way to specify the size of the NVDIMM 
device for the guest.
It is inferred by the (number of SCM blocks) * (SCM block-size) as 
specified in the
device tree. The label area is part of the nvdimm but not exposed to the 
guest
as you mentioned. So, if the user specifies size=1GB and label_size=3MB,
QEMU will report (1GB-3MB)/256MB (block size) = 3 blocks, with a block
size of 256MB.
The user gets 768MB of the device. Since the minimum required size is 256MB,
and label_area being outside, and we want it to be aligned to 256MB, I 
am forcing the minimum
device size to be 512MB. Actually it can be anything above 256MB + 
label_size. I see
your point, I can just not care about label_size but care only about the 
nvdimm size(-label_size)
is aligned to 256MB or not.

>>>> +        /* Align to scm block size, exclude the label */
>>>> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
>>>> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
>>> I'm not sure that arbitrarily fixing up region size is the right thing to do
>>> and also what you are trying to achieve here isn't clear, could you explain it some more?
>> The resize is required to allow the subsequent memory hotplugs to work. The
>> base address(if not specified) for the next dimm hotplug, starts at the
>> end of
>> this region. If the region is not aligned to LMB size, guest refuses to
>> claim the
>> newly hotplugged memory.  The label area can be small and need not be
>> aligned to (LMB/SCM block) size. The region size is actually the size
>> minus the
>> label_size which can be unaligned to LMB size. So, align down to SCM block
>> size is necessary here.
> Well fixing up object(MemoryRegion) which belongs to the backend from
> machine level to satisfy machine specific alignment requirements looks
> like a wrong thing to do.

For a 1GB device with say 3MB label_size, the qemu exposes 3 SCM blocks 
of 256MB each
and guest actually accesses 768MB of the region, even though the memory 
region size
is (1GB-3MB). But on x86, the guest actually sees (1GB-3MB), not any less.
Since the memory region size is larger than 768MB and is not aligned to
256MB, the subsequent dimm hotplug would fail, as the next free address
obtained from memory_device_get_free_addr is not aligned to 256MB.

[   35.617767] pseries-hotplug-mem: dlpar_memory: Memory add LMBs
[   35.619598] pseries-hotplug-mem: Attempting to hot-add 1 LMB(s) at 
index 80000040
[   35.619966] pseries-hotplug-mem: Attempting to hot-add in range 
40fe00000 - 40fe00000
[   35.620416] pseries-hotplug-mem: Attempting to hot-add in range 
40fe00000 - 40fe00000
[   35.621330] Block size [0x10000000 or 268435456] unaligned hotplug 
range: start 0x40fe00000, size 0x10000000
[   35.621432] pseries-hotplug-mem: Memory indexed-count-add failed, 
removing any added LMBs

This alignment problem is not unique to Power, I see the same happening 
on x86_64 too as the
memory block size is required to be aligned to 128MB there.

[   26.558423] Block size [0x8000000] unaligned hotplug range: start 
0x11ffe0000, size 0x8000000
[   26.558427] acpi PNP0C80:00: add_memory failed
[   26.558431] acpi PNP0C80:00: acpi_memory_enable_device() error
[   26.558433] acpi PNP0C80:00: Enumeration failure

The user has to circumvent this alignment issue by explicitly giving the 
256MB and 128MB as the align size on the
memory-backend-file object option on PPC and X86_64 respectively.

> So we need to come up with another approach.
> I'm sill not sure what problem is there but nvdimm already
> has a notion of data region (without label size) look for
> nvdimm->nvdimm_mr and mdc->get_memory_region and that's what you have in
> local var 'size'.
>    
>
> So what you are doing here look incorrect even more,
> i.e. beside we shouldn't do it at all and the second thing is that you are
> sizing down data area which already excludes label size.

To get nvdimm and dimm devices working together, the user has to specify
align=256m on the memory-backend-file object that backs the nvdimm
device.
With the align option, the nvdimm_prepare_memory_region() does the 
QEMU_ALIGN_DOWN of the
memory region which I am doing here by default. Doing it by default 
still makes sense as the
actual size the guest gets to use is only 768MB in this case. However, I 
should probably align down
only if the user has not specified the align value themselves, and let
nvdimm_prepare_memory_region() do it when they have.

Since this is PPC specific alignment requirement, I think this is the 
right place to enforce it & size down
by default here. There are checks for PPC specific DIMM size alignment 
requirement same way here.
If this is not the right place, could you suggest me a better place as the
nvdimm_prepare_memory_region() is generic and I cant set machine 
specific device properties there?


> What I'd suggest is to align up GPA of being added memory on
>     MAX(LMB size, backend_page_size, max supported huge page size)
> so hotplugged dimm or whatever else would be properly aligned,
> see pc_dimm_pre_plug(,legacy_align,) and how PC uses it.

Although this approach ensures that memory_device_get_free_addr() returns
an address aligned to the specified legacy_align, it also requires the
size of the device to be aligned to legacy_align. If I do not size down
the region size, (size - label_size) will not be aligned to it. That is
another reason why sizing down the region size is still needed.

>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>>        }
>>>>    
>>>>        memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
>>>> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
>>>> index 2edb7d1e9c..94ddd102cc 100644
>>>> --- a/hw/ppc/spapr_drc.c
>>>> +++ b/hw/ppc/spapr_drc.c
>>>> @@ -696,6 +696,16 @@ static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
>>>>        drck->release = spapr_lmb_release;
>>>>    }
>>>>    
>>>> +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
>>>> +{
>>>> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
>>>> +
>>>> +    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
>>>> +    drck->typename = "MEM";
>>>> +    drck->drc_name_prefix = "PMEM ";
>>>> +    drck->release = NULL;
>>>> +}
>>>> +
>>>>    static const TypeInfo spapr_dr_connector_info = {
>>>>        .name          = TYPE_SPAPR_DR_CONNECTOR,
>>>>        .parent        = TYPE_DEVICE,
>>>> @@ -739,6 +749,12 @@ static const TypeInfo spapr_drc_lmb_info = {
>>>>        .class_init    = spapr_drc_lmb_class_init,
>>>>    };
>>>>    
>>>> +static const TypeInfo spapr_drc_pmem_info = {
>>>> +    .name          = TYPE_SPAPR_DRC_PMEM,
>>>> +    .parent        = TYPE_SPAPR_DRC_LOGICAL,
>>>> +    .class_init    = spapr_drc_pmem_class_init,
>>>> +};
>>>> +
>>>>    /* helper functions for external users */
>>>>    
>>>>    sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
>>>> @@ -1189,6 +1205,7 @@ static void spapr_drc_register_types(void)
>>>>        type_register_static(&spapr_drc_cpu_info);
>>>>        type_register_static(&spapr_drc_pci_info);
>>>>        type_register_static(&spapr_drc_lmb_info);
>>>> +    type_register_static(&spapr_drc_pmem_info);
>>>>    
>>>>        spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
>>>>                            rtas_set_indicator);
>>>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>>>> index 32719a1b72..a4fed84346 100644
>>>> --- a/hw/ppc/spapr_events.c
>>>> +++ b/hw/ppc/spapr_events.c
>>>> @@ -193,6 +193,7 @@ struct rtas_event_log_v6_hp {
>>>>    #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
>>>>    #define RTAS_LOG_V6_HP_TYPE_PHB                          4
>>>>    #define RTAS_LOG_V6_HP_TYPE_PCI                          5
>>>> +#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
>>>>        uint8_t hotplug_action;
>>>>    #define RTAS_LOG_V6_HP_ACTION_ADD                        1
>>>>    #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
>>>> @@ -526,6 +527,9 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>>>>        case SPAPR_DR_CONNECTOR_TYPE_CPU:
>>>>            hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
>>>>            break;
>>>> +    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
>>>> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
>>>> +        break;
>>>>        default:
>>>>            /* we shouldn't be signaling hotplug events for resources
>>>>             * that don't support them
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index a947a0a0dc..21a9709afe 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -187,6 +187,7 @@ struct sPAPRMachineState {
>>>>    
>>>>        bool cmd_line_caps[SPAPR_CAP_NUM];
>>>>        sPAPRCapabilities def, eff, mig;
>>>> +    bool nvdimm_enabled;
>>>>    };
>>>>    
>>>>    #define H_SUCCESS         0
>>>> @@ -798,6 +799,15 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>>>>    #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
>>>>    #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
>>>>    
>>>> +/*
>>>> + * The nvdimm size should be aligned to SCM block size.
>>>> + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
>>>> + * inorder to have SCM regions not to overlap with dimm memory regions.
>>>> + * The SCM devices can have variable block sizes. For now, fixing the
>>>> + * block size to the minimum value.
>>>> + */
>>>> +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
>>>> +
>>>>    void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
>>>>    
>>>>    #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
>>>> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
>>>> index f6ff32e7e2..65925d00b1 100644
>>>> --- a/include/hw/ppc/spapr_drc.h
>>>> +++ b/include/hw/ppc/spapr_drc.h
>>>> @@ -70,6 +70,13 @@
>>>>    #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>>>                                            TYPE_SPAPR_DRC_LMB)
>>>>    
>>>> +#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem"
>>>> +#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
>>>> +        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
>>>> +#define SPAPR_DRC_PMEM_CLASS(klass) \
>>>> +        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
>>>> +#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
>>>> +                                        TYPE_SPAPR_DRC_PMEM)
>>>>    /*
>>>>     * Various hotplug types managed by sPAPRDRConnector
>>>>     *
>>>> @@ -87,6 +94,7 @@ typedef enum {
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
>>>>        SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
>>>> +    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
>>>>    } sPAPRDRConnectorTypeShift;
>>>>    
>>>>    typedef enum {
>>>> @@ -96,6 +104,7 @@ typedef enum {
>>>>        SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
>>>>        SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
>>>>        SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
>>>> +    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
>>>>    } sPAPRDRConnectorType;
>>>>    
>>>>    /*
>>>>
>>>>
Igor Mammedov March 5, 2019, 9:13 a.m. UTC | #10
On Thu, 28 Feb 2019 14:24:07 +0530
Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:

> Hi Igor,
> 
> Thanks for the elaboration. Please find my response inline.
> 
> 
> On 02/21/2019 07:42 PM, Igor Mammedov wrote:
> > On Tue, 19 Feb 2019 14:59:25 +0530
> > Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
> >  
> >> On 02/19/2019 01:41 PM, Igor Mammedov wrote:  
> >>> On Tue, 05 Feb 2019 23:26:27 -0600
> >>> Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
> >>>     
> >>>> Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
> >>>> device interface in QEMU to support virtual NVDIMM devices for Power (May have
> >>>> to re-look at this later).  Create the required DT entries for the
> >>>> device (some entries have dummy values right now).
> >>>>
> >>>> The patch creates the required DT node and sends a hotplug
> >>>> interrupt to the guest. Guest is expected to undertake the normal
> >>>> DR resource add path in response and start issuing PAPR SCM hcalls.
> >>>>
> >>>> This is how it can be used ..
> >>>> Add nvdimm=on to the qemu machine argument.
> >>>> Ex : -machine pseries,nvdimm=on
> >>>> For coldplug, the device to be added in qemu command line as shown below
> >>>> -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> >>>> -device nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> >>>>
> >>>> For hotplug, the device to be added from monitor as below
> >>>> object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0.img,share=yes,size=512m
> >>>> device_add nvdimm,label-size=128k,memdev=memnvdimm0,id=nvdimm0,slot=0
> >>>>
> >>>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> >>>> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> >>>>                  [Early implementation]
> >>>> ---
> >>>>    default-configs/ppc64-softmmu.mak |    1
> >>>>    hw/ppc/spapr.c                    |  212 +++++++++++++++++++++++++++++++++++--
> >>>>    hw/ppc/spapr_drc.c                |   17 +++
> >>>>    hw/ppc/spapr_events.c             |    4 +
> >>>>    include/hw/ppc/spapr.h            |   10 ++
> >>>>    include/hw/ppc/spapr_drc.h        |    9 ++
> >>>>    6 files changed, 241 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> >>>> index 7f34ad0528..b6e1aa5125 100644
> >>>> --- a/default-configs/ppc64-softmmu.mak
> >>>> +++ b/default-configs/ppc64-softmmu.mak
> >>>> @@ -20,4 +20,5 @@ CONFIG_XIVE=$(CONFIG_PSERIES)
> >>>>    CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
> >>>>    CONFIG_MEM_DEVICE=y
> >>>>    CONFIG_DIMM=y
> >>>> +CONFIG_NVDIMM=y
> >>>>    CONFIG_SPAPR_RNG=y
> >>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>> index 0fcdd35cbe..7e7a1a8041 100644
> >>>> --- a/hw/ppc/spapr.c
> >>>> +++ b/hw/ppc/spapr.c
> >>>> @@ -73,6 +73,7 @@
> >>>>    #include "qemu/cutils.h"
> >>>>    #include "hw/ppc/spapr_cpu_core.h"
> >>>>    #include "hw/mem/memory-device.h"
> >>>> +#include "hw/mem/nvdimm.h"
> >>>>    
> >>>>    #include <libfdt.h>
> >>>>    
> >>>> @@ -690,6 +691,7 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> >>>>        uint8_t *int_buf, *cur_index, buf_len;
> >>>>        int ret;
> >>>>        uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> >>>> +    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> >>>>        uint64_t addr, cur_addr, size;
> >>>>        uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
> >>>>        uint64_t mem_end = machine->device_memory->base +
> >>>> @@ -726,15 +728,24 @@ static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
> >>>>                nr_entries++;
> >>>>            }
> >>>>    
> >>>> -        /* Entry for DIMM */
> >>>> -        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> >>>> -        g_assert(drc);
> >>>> -        elem = spapr_get_drconf_cell(size / lmb_size, addr,
> >>>> -                                     spapr_drc_index(drc), node,
> >>>> -                                     SPAPR_LMB_FLAGS_ASSIGNED);
> >>>> +        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
> >>>> +            /* Entry for NVDIMM */
> >>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
> >>>> +            g_assert(drc);
> >>>> +            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
> >>>> +                                         spapr_drc_index(drc), -1, 0);
> >>>> +            cur_addr = ROUND_UP(addr + size, scm_block_size);
> >>>> +        } else {
> >>>> +            /* Entry for DIMM */
> >>>> +            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
> >>>> +            g_assert(drc);
> >>>> +            elem = spapr_get_drconf_cell(size / lmb_size, addr,
> >>>> +                                         spapr_drc_index(drc), node,
> >>>> +                                         SPAPR_LMB_FLAGS_ASSIGNED);
> >>>> +            cur_addr = addr + size;
> >>>> +        }
> >>>>            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
> >>>>            nr_entries++;
> >>>> -        cur_addr = addr + size;
> >>>>        }
> >>>>    
> >>>>        /* Entry for remaining hotpluggable area */
> >>>> @@ -1225,6 +1236,42 @@ static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
> >>>>        }
> >>>>    }
> >>>>    
> >>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
> >>>> +                                      uint32_t node, uint64_t addr,
> >>>> +                                      uint64_t size, uint64_t label_size);
> >>>> +static void spapr_create_nvdimm(void *fdt)
> >>>> +{
> >>>> +    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
> >>>> +    GSList *dimms = NULL;
> >>>> +
> >>>> +    if (offset < 0) {
> >>>> +        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
> >>>> +        _FDT(offset);
> >>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
> >>>> +        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
> >>>> +        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
> >>>> +        _FDT((fdt_setprop_string(fdt, offset, "device_type",
> >>>> +                                 "ibm,persistent-memory")));
> >>>> +    }
> >>>> +
> >>>> +    /*NB : Add drc-info array here */
> >>>> +
> >>>> +    /* Create DT entries for cold plugged NVDIMM devices */
> >>>> +    dimms = nvdimm_get_device_list();
> >>>> +    for (; dimms; dimms = dimms->next) {
> >>>> +        NVDIMMDevice *nvdimm = dimms->data;
> >>>> +        PCDIMMDevice *di = PC_DIMM(nvdimm);
> >>>> +        uint64_t lsize = nvdimm->label_size;
> >>>> +        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> >>>> +                                           NULL);
> >>>> +
> >>>> +        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
> >>>> +                                   size, lsize);
> >>>> +    }
> >>>> +    g_slist_free(dimms);
> >>>> +    return;
> >>>> +}
> >>>> +
> >>>>    static void *spapr_build_fdt(sPAPRMachineState *spapr)
> >>>>    {
> >>>>        MachineState *machine = MACHINE(spapr);
> >>>> @@ -1348,6 +1395,11 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr)
> >>>>            exit(1);
> >>>>        }
> >>>>    
> >>>> +    /* NVDIMM devices */
> >>>> +    if (spapr->nvdimm_enabled) {
> >>>> +        spapr_create_nvdimm(fdt);
> >>>> +    }
> >>>> +
> >>>>        return fdt;
> >>>>    }
> >>>>    
> >>>> @@ -3143,6 +3195,20 @@ static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
> >>>>        }
> >>>>    }
> >>>>    
> >>>> +static bool spapr_get_nvdimm(Object *obj, Error **errp)
> >>>> +{
> >>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >>>> +
> >>>> +    return spapr->nvdimm_enabled;
> >>>> +}
> >>>> +
> >>>> +static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
> >>>> +{
> >>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >>>> +
> >>>> +    spapr->nvdimm_enabled = value;
> >>>> +}
> >>>> +
> >>>>    static void spapr_instance_init(Object *obj)
> >>>>    {
> >>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
> >>>> @@ -3188,6 +3254,11 @@ static void spapr_instance_init(Object *obj)
> >>>>        object_property_set_description(obj, "ic-mode",
> >>>>                     "Specifies the interrupt controller mode (xics, xive, dual)",
> >>>>                     NULL);
> >>>> +    object_property_add_bool(obj, "nvdimm",
> >>>> +                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
> >>>> +    object_property_set_description(obj, "nvdimm",
> >>>> +                                    "Enable support for nvdimm devices",
> >>>> +                                    NULL);
> >>>>    }
> >>>>    
> >>>>    static void spapr_machine_finalizefn(Object *obj)
> >>>> @@ -3267,12 +3338,103 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
> >>>>        }
> >>>>    }
> >>>>    
> >>>> +static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
> >>>> +                                      uint64_t addr, uint64_t size,
> >>>> +                                      uint64_t label_size)
> >>>> +{
> >>>> +    int offset;
> >>>> +    char buf[40];
> >>>> +    GString *lcode = g_string_sized_new(10);
> >>>> +    sPAPRDRConnector *drc;
> >>>> +    QemuUUID uuid;
> >>>> +    uint32_t drc_idx;
> >>>> +    uint32_t associativity[] = {
> >>>> +        cpu_to_be32(0x4), /* length */
> >>>> +        cpu_to_be32(0x0), cpu_to_be32(0x0),
> >>>> +        cpu_to_be32(0x0), cpu_to_be32(node)
> >>>> +    };
> >>>> +
> >>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> >>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >>>> +    g_assert(drc);
> >>>> +
> >>>> +    drc_idx = spapr_drc_index(drc);
> >>>> +
> >>>> +    sprintf(buf, "pmem@%x", drc_idx);
> >>>> +    offset = fdt_add_subnode(fdt, fdt_offset, buf);
> >>>> +    _FDT(offset);
> >>>> +
> >>>> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
> >>>> +    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
> >>>> +    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
> >>>> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
> >>>> +
> >>>> +    /*NB : Supposed to be random strings. Currently empty 10 strings! */
> >>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
> >>>> +    g_string_free(lcode, TRUE);
> >>>> +
> >>>> +    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> >>>> +                      sizeof(associativity))));
> >>>> +    g_random_set_seed(drc_idx);
> >>>> +    qemu_uuid_generate(&uuid);
> >>>> +
> >>>> +    qemu_uuid_unparse(&uuid, buf);
> >>>> +    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
> >>>> +
> >>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
> >>>> +
> >>>> +    /*NB : What it should be? */
> >>>> +    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
> >>>> +
> >>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
> >>>> +                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> >>>> +    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
> >>>> +                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
> >>>> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
> >>>> +
> >>>> +    return offset;
> >>>> +}
> >>>> +
> >>>> +static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
> >>>> +                             uint64_t size, uint32_t node,
> >>>> +                             Error **errp)
> >>>> +{
> >>>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
> >>>> +    sPAPRDRConnector *drc;
> >>>> +    bool hotplugged = spapr_drc_hotplugged(dev);
> >>>> +    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> >>>> +    void *fdt;
> >>>> +    int fdt_offset, fdt_size;
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
> >>>> +                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >>>> +    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
> >>>> +                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
> >>>> +    g_assert(drc);
> >>>> +
> >>>> +    fdt = create_device_tree(&fdt_size);
> >>>> +    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
> >>>> +                                            size, nvdimm->label_size);
> >>>> +
> >>>> +    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
> >>>> +    if (local_err) {
> >>>> +        error_propagate(errp, local_err);
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    if (hotplugged) {
> >>>> +        spapr_hotplug_req_add_by_index(drc);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>>    static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>>>                                  Error **errp)
> >>>>    {
> >>>>        Error *local_err = NULL;
> >>>>        sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
> >>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
> >>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> >>>>        uint64_t size, addr;
> >>>>        uint32_t node;
> >>>>    
> >>>> @@ -3291,9 +3453,14 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>>>    
> >>>>        node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
> >>>>                                        &error_abort);
> >>>> -    spapr_add_lmbs(dev, addr, size, node,
> >>>> -                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> >>>> -                   &local_err);
> >>>> +    if (!is_nvdimm) {
> >>>> +        spapr_add_lmbs(dev, addr, size, node,
> >>>> +                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
> >>>> +                       &local_err);
> >>>> +    } else {
> >>>> +        spapr_add_nvdimm(dev, addr, size, node, &local_err);
> >>>> +    }
> >>>> +
> >>>>        if (local_err) {
> >>>>            goto out_unplug;
> >>>>        }
> >>>> @@ -3311,6 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>>>    {
> >>>>        const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
> >>>>        sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
> >>>> +    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
> >>>>        PCDIMMDevice *dimm = PC_DIMM(dev);
> >>>>        Error *local_err = NULL;
> >>>>        uint64_t size;
> >>>> @@ -3328,10 +3496,30 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >>>>            return;
> >>>>        }
> >>>>    
> >>>> -    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
> >>>> +    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
> >>>>            error_setg(errp, "Hotplugged memory size must be a multiple of "
> >>>> -                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> >>>> +                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
> >>>>            return;
> >>>> +    } else if (is_nvdimm) {
> >>>> +        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
> >>>> +        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> >>>> +            error_setg(errp, "NVDIMM memory size must be a multiple of "
> >>>> +                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> >>>> +            return;
> >>>> +        }
> >>>> +        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
> >>>> +            error_setg(errp, "NVDIMM size must be atleast "
> >>>> +                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
> >>>> +            return;
> >>>> +        }  
> > on second glance, 2 things look weird here:
> >    1. we shouldn't poke inside of nvdimm object directly, there is NVDIMM_LABEL_SIZE_PROP
> >       if you really need to get label size  
> 
> Ok. Will use the property.
> 
> >    2. why do we need to care about label_size here at all?  
> 
> On PPC, there is no explicit way to specify the size of the NVDIMM 
> device for the guest.
> It is inferred by the (number of SCM blocks) * (SCM block-size) as 
> specified in the
> device tree. The label area is part of the nvdimm but not exposed to the 
> guest
> as you mentioned. So, if user specified size=1GB, and label_size=3MB, 
> the qemu
> will say (1GB-3MB)/256MB(block size) = 3 number of blocks and block size 
> as 256MB.
> The user gets 768MB of the device. Since the minimum required size is 256MB,
> and label_area being outside, and we want it to be aligned to 256MB, I 
> am forcing the minimum
> device size to be 512MB. Actually it can be anything above 256MB + 
> label_size. I see
> your point, I can just not care about label_size but care only about the 
> nvdimm size(-label_size)
> is aligned to 256MB or not.
> 
> >>>> +        /* Align to scm block size, exclude the label */
> >>>> +        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
> >>>> +               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);  
> >>> I'm not sure that arbitrarily fixing up region size is the right thing to do
> >>> and also what you are trying to achieve here isn't clear, could you explain it some more?  
> >> The resize is required to allow the subsequent memory hotplugs to work. The
> >> base address(if not specified) for the next dimm hotplug, starts at the
> >> end of
> >> this region. If the region is not aligned to LMB size, guest refuses to
> >> claim the
> >> newly hotplugged memory.  The label area can be small and need not be
> >> aligned to (LMB/SCM block) size. The region size is actually the size
> >> minus the
> >> label_size which can be unaligned to LMB size. So, align down to SCM block
> >> size is necessary here.  
> > Well fixing up object(MemoryRegion) which belongs to the backend from
> > machine level to satisfy machine specific alignment requirements looks
> > like a wrong thing to do.  
> 
> For a 1GB device with say 3MB label_size, the qemu exposes 3 SCM blocks 
> of 256MB each
> and guest actually accesses 768MB of the region, even though the memory 
> region size
> is (1GB-3MB). But on x86, the guest actually sees (1GB-3MB), not any less.
> The memory region size is larger than 768 and is unaligned to 256MB, the
> subsequent dimm hotplug would fail as the next free address got from
> memory_device_get_free_addr is not aligned to 256MB.
> 
> [   35.617767] pseries-hotplug-mem: dlpar_memory: Memory add LMBs
> [   35.619598] pseries-hotplug-mem: Attempting to hot-add 1 LMB(s) at 
> index 80000040
> [   35.619966] pseries-hotplug-mem: Attempting to hot-add in range 
> 40fe00000 - 40fe00000
> [   35.620416] pseries-hotplug-mem: Attempting to hot-add in range 
> 40fe00000 - 40fe00000
> [   35.621330] Block size [0x10000000 or 268435456] unaligned hotplug 
> range: start 0x40fe00000, size 0x10000000
> [   35.621432] pseries-hotplug-mem: Memory indexed-count-add failed, 
> removing any added LMBs
> 
> This alignment problem is not unique to Power, I see the same happening 
> on x86_64 too as the
> memory block size is required to be aligned to 128MB there.
> 
> [   26.558423] Block size [0x8000000] unaligned hotplug range: start 
> 0x11ffe0000, size 0x8000000
> [   26.558427] acpi PNP0C80:00: add_memory failed
> [   26.558431] acpi PNP0C80:00: acpi_memory_enable_device() error
> [   26.558433] acpi PNP0C80:00: Enumeration failure
> 
> The user has to circumvent this alignment issue by explicitly giving the 
> 256MB and 128MB as the align size on the
> memory-backend-file object option on PPC and X86_64 respectively.
> 
> > So we need to come up with another approach.
> > I'm still not sure what the problem is, but nvdimm already
> > has a notion of a data region (without label size); look for
> > nvdimm->nvdimm_mr and mdc->get_memory_region. That's what you have in
> > local var 'size'.
> >    
> >
> > So what you are doing here looks even more incorrect:
> > first, we shouldn't do it at all, and second, you are
> > sizing down a data area which already excludes the label size.  
> 
> To get the things(nvdimm & dimm) working together, the user has to give 
> align=256m on
> memory-backend-file device option for nvdimm device backend object.
> With the align option, the nvdimm_prepare_memory_region() does the 
> QEMU_ALIGN_DOWN of the
> memory region which I am doing here by default. Doing it by default 
> still makes sense as the
> actual size the guest gets to use is only 768MB in this case. However, I 
> should probably do align down
> only if the user has not specified the value by himself, and let 
> nvdimm_prepare_memory_region() do
> it in such a case.
> 
> Since this is PPC specific alignment requirement, I think this is the 
> right place to enforce it & size down
> by default here. There are checks for PPC specific DIMM size alignment 
> requirement same way here.
> If this is not the right place, could you suggest me a better place as the
> nvdimm_prepare_memory_region() is generic and I cant set machine 
> specific device properties there?
> 
> 
> > What I'd suggest is to align up GPA of being added memory on
> >     MAX(LMB size, backend_page_size, max supported huge page size)
> > so hotplugged dimm or whatever else would be properly aligned,
> > see pc_dimm_pre_plug(,legacy_align,) and how PC uses it.  
> 
> This approach does ensure that memory_device_get_free_addr() returns an 
> address aligned to the given legacy_align, but it also requires the size 
> of the device to be aligned to that legacy_align. If I am not sizing down 
> the region size, size-label_size will not be aligned to it. That is 
> another reason why sizing down the region size is still needed.
The thing with size alignment is that it's a guest-specific requirement
that varies depending on which OS is running inside. So if the spec doesn't
specify an alignment, I'd look at the backend page size as such. It's up to the
mgmt layer to configure the size properly, as the upper stack should be aware
of which guest it runs.

In the case of SPAPR, is size alignment an architectural requirement, or is it
just a detail of a specific guest implementation?
If it's the former, I'd replace sizing down with a check and refuse an
improperly sized nvdimm; if it's the latter, I'd let mgmt pick the size
properly depending on the guest OS.
diff mbox series

Patch

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 7f34ad0528..b6e1aa5125 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -20,4 +20,5 @@  CONFIG_XIVE=$(CONFIG_PSERIES)
 CONFIG_XIVE_SPAPR=$(CONFIG_PSERIES)
 CONFIG_MEM_DEVICE=y
 CONFIG_DIMM=y
+CONFIG_NVDIMM=y
 CONFIG_SPAPR_RNG=y
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0fcdd35cbe..7e7a1a8041 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -73,6 +73,7 @@ 
 #include "qemu/cutils.h"
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
+#include "hw/mem/nvdimm.h"
 
 #include <libfdt.h>
 
@@ -690,6 +691,7 @@  static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
     uint8_t *int_buf, *cur_index, buf_len;
     int ret;
     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
+    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
     uint64_t addr, cur_addr, size;
     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
     uint64_t mem_end = machine->device_memory->base +
@@ -726,15 +728,24 @@  static int spapr_populate_drmem_v2(sPAPRMachineState *spapr, void *fdt,
             nr_entries++;
         }
 
-        /* Entry for DIMM */
-        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
-        g_assert(drc);
-        elem = spapr_get_drconf_cell(size / lmb_size, addr,
-                                     spapr_drc_index(drc), node,
-                                     SPAPR_LMB_FLAGS_ASSIGNED);
+        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
+            /* Entry for NVDIMM */
+            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, addr / scm_block_size);
+            g_assert(drc);
+            elem = spapr_get_drconf_cell(size / scm_block_size, addr,
+                                         spapr_drc_index(drc), -1, 0);
+            cur_addr = ROUND_UP(addr + size, scm_block_size);
+        } else {
+            /* Entry for DIMM */
+            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
+            g_assert(drc);
+            elem = spapr_get_drconf_cell(size / lmb_size, addr,
+                                         spapr_drc_index(drc), node,
+                                         SPAPR_LMB_FLAGS_ASSIGNED);
+            cur_addr = addr + size;
+        }
         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
         nr_entries++;
-        cur_addr = addr + size;
     }
 
     /* Entry for remaining hotpluggable area */
@@ -1225,6 +1236,42 @@  static void spapr_dt_hypervisor(sPAPRMachineState *spapr, void *fdt)
     }
 }
 
+static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset,
+                                      uint32_t node, uint64_t addr,
+                                      uint64_t size, uint64_t label_size);
+static void spapr_create_nvdimm(void *fdt)
+{
+    int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
+    GSList *dimms = NULL;
+
+    if (offset < 0) {
+        offset = fdt_add_subnode(fdt, 0, "persistent-memory");
+        _FDT(offset);
+        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x2)));
+        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
+        _FDT((fdt_setprop_string(fdt, offset, "name", "persistent-memory")));
+        _FDT((fdt_setprop_string(fdt, offset, "device_type",
+                                 "ibm,persistent-memory")));
+    }
+
+    /*NB : Add drc-info array here */
+
+    /* Create DT entries for cold plugged NVDIMM devices */
+    dimms = nvdimm_get_device_list();
+    for (; dimms; dimms = dimms->next) {
+        NVDIMMDevice *nvdimm = dimms->data;
+        PCDIMMDevice *di = PC_DIMM(nvdimm);
+        uint64_t lsize = nvdimm->label_size;
+        int size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
+                                           NULL);
+
+        spapr_populate_nvdimm_node(fdt, offset, di->node, di->addr,
+                                   size, lsize);
+    }
+    g_slist_free(dimms);
+    return;
+}
+
 static void *spapr_build_fdt(sPAPRMachineState *spapr)
 {
     MachineState *machine = MACHINE(spapr);
@@ -1348,6 +1395,11 @@  static void *spapr_build_fdt(sPAPRMachineState *spapr)
         exit(1);
     }
 
+    /* NVDIMM devices */
+    if (spapr->nvdimm_enabled) {
+        spapr_create_nvdimm(fdt);
+    }
+
     return fdt;
 }
 
@@ -3143,6 +3195,20 @@  static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
     }
 }
 
+static bool spapr_get_nvdimm(Object *obj, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    return spapr->nvdimm_enabled;
+}
+
+static void spapr_set_nvdimm(Object *obj, bool value, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    spapr->nvdimm_enabled = value;
+}
+
 static void spapr_instance_init(Object *obj)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
@@ -3188,6 +3254,11 @@  static void spapr_instance_init(Object *obj)
     object_property_set_description(obj, "ic-mode",
                  "Specifies the interrupt controller mode (xics, xive, dual)",
                  NULL);
+    object_property_add_bool(obj, "nvdimm",
+                            spapr_get_nvdimm, spapr_set_nvdimm, NULL);
+    object_property_set_description(obj, "nvdimm",
+                                    "Enable support for nvdimm devices",
+                                    NULL);
 }
 
 static void spapr_machine_finalizefn(Object *obj)
@@ -3267,12 +3338,103 @@  static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
     }
 }
 
+static int spapr_populate_nvdimm_node(void *fdt, int fdt_offset, uint32_t node,
+                                      uint64_t addr, uint64_t size,
+                                      uint64_t label_size)
+{
+    int offset;
+    char buf[40];
+    GString *lcode = g_string_sized_new(10);
+    sPAPRDRConnector *drc;
+    QemuUUID uuid;
+    uint32_t drc_idx;
+    uint32_t associativity[] = {
+        cpu_to_be32(0x4), /* length */
+        cpu_to_be32(0x0), cpu_to_be32(0x0),
+        cpu_to_be32(0x0), cpu_to_be32(node)
+    };
+
+    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
+                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
+    g_assert(drc);
+
+    drc_idx = spapr_drc_index(drc);
+
+    sprintf(buf, "pmem@%x", drc_idx);
+    offset = fdt_add_subnode(fdt, fdt_offset, buf);
+    _FDT(offset);
+
+    _FDT((fdt_setprop_cell(fdt, offset, "reg", drc_idx)));
+    _FDT((fdt_setprop_string(fdt, offset, "compatible", "ibm,pmemory")));
+    _FDT((fdt_setprop_string(fdt, offset, "name", "pmem")));
+    _FDT((fdt_setprop_string(fdt, offset, "device_type", "ibm,pmemory")));
+
+    /*NB : Supposed to be random strings. Currently empty 10 strings! */
+    _FDT((fdt_setprop(fdt, offset, "ibm,loc-code", lcode->str, lcode->len)));
+    g_string_free(lcode, TRUE);
+
+    _FDT((fdt_setprop(fdt, offset, "ibm,associativity", associativity,
+                      sizeof(associativity))));
+    g_random_set_seed(drc_idx);
+    qemu_uuid_generate(&uuid);
+
+    qemu_uuid_unparse(&uuid, buf);
+    _FDT((fdt_setprop_string(fdt, offset, "ibm,unit-guid", buf)));
+
+    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_idx)));
+
+    /*NB : What it should be? */
+    _FDT(fdt_setprop_cell(fdt, offset, "ibm,latency-attribute", 828));
+
+    _FDT((fdt_setprop_u64(fdt, offset, "ibm,block-size",
+                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
+    _FDT((fdt_setprop_u64(fdt, offset, "ibm,number-of-blocks",
+                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
+    _FDT((fdt_setprop_cell(fdt, offset, "ibm,metadata-size", label_size)));
+
+    return offset;
+}
+
+static void spapr_add_nvdimm(DeviceState *dev, uint64_t addr,
+                             uint64_t size, uint32_t node,
+                             Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_hotplug_handler(dev));
+    sPAPRDRConnector *drc;
+    bool hotplugged = spapr_drc_hotplugged(dev);
+    NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
+    void *fdt;
+    int fdt_offset, fdt_size;
+    Error *local_err = NULL;
+
+    spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM,
+                           addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
+    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM,
+                          addr / SPAPR_MINIMUM_SCM_BLOCK_SIZE);
+    g_assert(drc);
+
+    fdt = create_device_tree(&fdt_size);
+    fdt_offset = spapr_populate_nvdimm_node(fdt, 0, node, addr,
+                                            size, nvdimm->label_size);
+
+    spapr_drc_attach(drc, dev, fdt, fdt_offset, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (hotplugged) {
+        spapr_hotplug_req_add_by_index(drc);
+    }
+}
+
 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                               Error **errp)
 {
     Error *local_err = NULL;
     sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
     PCDIMMDevice *dimm = PC_DIMM(dev);
+    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
     uint64_t size, addr;
     uint32_t node;
 
@@ -3291,9 +3453,14 @@  static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
 
     node = object_property_get_uint(OBJECT(dev), PC_DIMM_NODE_PROP,
                                     &error_abort);
-    spapr_add_lmbs(dev, addr, size, node,
-                   spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
-                   &local_err);
+    if (!is_nvdimm) {
+        spapr_add_lmbs(dev, addr, size, node,
+                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
+                       &local_err);
+    } else {
+        spapr_add_nvdimm(dev, addr, size, node, &local_err);
+    }
+
     if (local_err) {
         goto out_unplug;
     }
@@ -3311,6 +3478,7 @@  static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
 {
     const sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
     sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
+    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
     PCDIMMDevice *dimm = PC_DIMM(dev);
     Error *local_err = NULL;
     uint64_t size;
@@ -3328,10 +3496,30 @@  static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
         return;
     }
 
-    if (size % SPAPR_MEMORY_BLOCK_SIZE) {
+    if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) {
         error_setg(errp, "Hotplugged memory size must be a multiple of "
-                      "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
+                          "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
         return;
+    } else if (is_nvdimm) {
+        NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev));
+        if ((nvdimm->label_size + size) % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
+            error_setg(errp, "NVDIMM memory size must be a multiple of "
+                       "%" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
+            return;
+        }
+        if (((nvdimm->label_size + size) / SPAPR_MINIMUM_SCM_BLOCK_SIZE) == 1) {
+            error_setg(errp, "NVDIMM size must be atleast "
+                       "%" PRIu64 "MB", 2 * SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
+            return;
+        }
+
+        /* Align to scm block size, exclude the label */
+        memory_device_set_region_size(MEMORY_DEVICE(nvdimm),
+               QEMU_ALIGN_DOWN(size, SPAPR_MINIMUM_SCM_BLOCK_SIZE), &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
     }
 
     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index 2edb7d1e9c..94ddd102cc 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -696,6 +696,16 @@  static void spapr_drc_lmb_class_init(ObjectClass *k, void *data)
     drck->release = spapr_lmb_release;
 }
 
+static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)
+{
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
+
+    drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
+    drck->typename = "MEM";
+    drck->drc_name_prefix = "PMEM ";
+    drck->release = NULL;
+}
+
 static const TypeInfo spapr_dr_connector_info = {
     .name          = TYPE_SPAPR_DR_CONNECTOR,
     .parent        = TYPE_DEVICE,
@@ -739,6 +749,12 @@  static const TypeInfo spapr_drc_lmb_info = {
     .class_init    = spapr_drc_lmb_class_init,
 };
 
+static const TypeInfo spapr_drc_pmem_info = {
+    .name          = TYPE_SPAPR_DRC_PMEM,
+    .parent        = TYPE_SPAPR_DRC_LOGICAL,
+    .class_init    = spapr_drc_pmem_class_init,
+};
+
 /* helper functions for external users */
 
 sPAPRDRConnector *spapr_drc_by_index(uint32_t index)
@@ -1189,6 +1205,7 @@  static void spapr_drc_register_types(void)
     type_register_static(&spapr_drc_cpu_info);
     type_register_static(&spapr_drc_pci_info);
     type_register_static(&spapr_drc_lmb_info);
+    type_register_static(&spapr_drc_pmem_info);
 
     spapr_rtas_register(RTAS_SET_INDICATOR, "set-indicator",
                         rtas_set_indicator);
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 32719a1b72..a4fed84346 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -193,6 +193,7 @@  struct rtas_event_log_v6_hp {
 #define RTAS_LOG_V6_HP_TYPE_SLOT                         3
 #define RTAS_LOG_V6_HP_TYPE_PHB                          4
 #define RTAS_LOG_V6_HP_TYPE_PCI                          5
+#define RTAS_LOG_V6_HP_TYPE_PMEM                         6
     uint8_t hotplug_action;
 #define RTAS_LOG_V6_HP_ACTION_ADD                        1
 #define RTAS_LOG_V6_HP_ACTION_REMOVE                     2
@@ -526,6 +527,9 @@  static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
     case SPAPR_DR_CONNECTOR_TYPE_CPU:
         hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
         break;
+    case SPAPR_DR_CONNECTOR_TYPE_PMEM:
+        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PMEM;
+        break;
     default:
         /* we shouldn't be signaling hotplug events for resources
          * that don't support them
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index a947a0a0dc..21a9709afe 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -187,6 +187,7 @@  struct sPAPRMachineState {
 
     bool cmd_line_caps[SPAPR_CAP_NUM];
     sPAPRCapabilities def, eff, mig;
+    bool nvdimm_enabled;
 };
 
 #define H_SUCCESS         0
@@ -798,6 +799,15 @@  int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
 #define SPAPR_LMB_FLAGS_DRC_INVALID 0x00000020
 #define SPAPR_LMB_FLAGS_RESERVED 0x00000080
 
+/*
+ * The nvdimm size should be aligned to the SCM block size.
+ * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
+ * in order for SCM regions not to overlap with dimm memory regions.
+ * SCM devices can have variable block sizes. For now, the block
+ * size is fixed to the minimum value.
+ */
+#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
+
 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);
 
 #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
index f6ff32e7e2..65925d00b1 100644
--- a/include/hw/ppc/spapr_drc.h
+++ b/include/hw/ppc/spapr_drc.h
@@ -70,6 +70,13 @@ 
 #define SPAPR_DRC_LMB(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
                                         TYPE_SPAPR_DRC_LMB)
 
+#define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem" /* type name for PMEM DR connectors */
+#define SPAPR_DRC_PMEM_GET_CLASS(obj) \
+        OBJECT_GET_CLASS(sPAPRDRConnectorClass, obj, TYPE_SPAPR_DRC_PMEM)
+#define SPAPR_DRC_PMEM_CLASS(klass) \
+        OBJECT_CLASS_CHECK(sPAPRDRConnectorClass, klass, TYPE_SPAPR_DRC_PMEM)
+#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(sPAPRDRConnector, (obj), \
+                                        TYPE_SPAPR_DRC_PMEM)
 /*
  * Various hotplug types managed by sPAPRDRConnector
  *
@@ -87,6 +94,7 @@  typedef enum {
     SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO = 3,
     SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI = 4,
     SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB = 8,
+    SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM = 9,
 } sPAPRDRConnectorTypeShift;
 
 typedef enum {
@@ -96,6 +104,7 @@  typedef enum {
     SPAPR_DR_CONNECTOR_TYPE_VIO = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_VIO,
     SPAPR_DR_CONNECTOR_TYPE_PCI = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PCI,
     SPAPR_DR_CONNECTOR_TYPE_LMB = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_LMB,
+    SPAPR_DR_CONNECTOR_TYPE_PMEM = 1 << SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM,
 } sPAPRDRConnectorType;
 
 /*