diff mbox series

[RFC,v2,3/3] spapr: Add Hcalls to support PAPRNVDIMM device

Message ID 155773968985.49142.1164691973469833295.stgit@lep8c.aus.stglabs.ibm.com
State New
Headers show
Series ppc: spapr: virtual NVDIMM support | expand

Commit Message

Shivaprasad G Bhat May 13, 2019, 9:28 a.m. UTC
This patch implements few of the necessary hcalls for the nvdimm support.

PAPR semantics is such that each NVDIMM device is comprising of multiple
SCM(Storage Class Memory) blocks. The guest requests the hypervisor to bind
each of the SCM blocks of the NVDIMM device using hcalls. There can be
SCM block unbind requests in case of driver errors or unplug(not supported now)
use cases. The NVDIMM label read/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind,
unbind, and queries using hcalls on those blocks can come independently. This
doesn't fit well into the qemu device semantics, where the map/unmap are done
at the (whole)device/object level granularity. The patch doesnt actually
bind/unbind on hcalls but let it happen at the object_add/del phase itself
instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the
region level granularity. Without interleaving, each virtual NVDIMM device is
presented as separate region. There is no way to configure the virtual NVDIMM
interleaving for the guests today. So, there is no way a partial bind/unbind
request can come for the vNVDIMM in a hcall for a subset of SCM blocks of a
virtual NVDIMM. Hence it is safe to do bind/unbind everything during the
object_add/del.

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
---
 hw/ppc/spapr_hcall.c   |  202 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h |    7 +-
 2 files changed, 208 insertions(+), 1 deletion(-)

Comments

David Gibson May 14, 2019, 4:38 a.m. UTC | #1
On Mon, May 13, 2019 at 04:28:36AM -0500, Shivaprasad G Bhat wrote:
> This patch implements few of the necessary hcalls for the nvdimm support.
> 
> PAPR semantics is such that each NVDIMM device is comprising of multiple
> SCM(Storage Class Memory) blocks. The guest requests the hypervisor to bind
> each of the SCM blocks of the NVDIMM device using hcalls. There can be
> SCM block unbind requests in case of driver errors or unplug(not supported now)
> use cases. The NVDIMM label read/writes are done through hcalls.
> 
> Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind,
> unbind, and queries using hcalls on those blocks can come independently. This
> doesn't fit well into the qemu device semantics, where the map/unmap are done
> at the (whole)device/object level granularity. The patch doesnt actually
> bind/unbind on hcalls but let it happen at the object_add/del phase itself
> instead.
> 
> The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the
> region level granularity. Without interleaving, each virtual NVDIMM device is
> presented as separate region. There is no way to configure the virtual NVDIMM
> interleaving for the guests today. So, there is no way a partial bind/unbind
> request can come for the vNVDIMM in a hcall for a subset of SCM blocks of a
> virtual NVDIMM. Hence it is safe to do bind/unbind everything during the
> object_add/del.
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> ---
>  hw/ppc/spapr_hcall.c   |  202 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    7 +-
>  2 files changed, 208 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 6c16d2b120..b6e7d04dcf 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -3,11 +3,13 @@
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/sysemu.h"
>  #include "qemu/log.h"
> +#include "qemu/range.h"
>  #include "qemu/error-report.h"
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
>  #include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_drc.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "mmu-hash64.h"
>  #include "cpu-models.h"
> @@ -16,6 +18,7 @@
>  #include "hw/ppc/spapr_ovec.h"
>  #include "mmu-book3s-v3.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/mem/nvdimm.h"
>  
>  static bool has_spr(PowerPCCPU *cpu, int spr)
>  {
> @@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
>      return H_SUCCESS;
>  }
>  
> +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
> +                                        SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t numBytesToRead = args[2];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
> +        numBytesToRead != 4 && numBytesToRead != 8) {
> +        return H_P3;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    if ((offset + numBytesToRead < offset) ||
> +        (nvdimm->label_size < numBytesToRead + offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);

I'm pretty sure this will need some sort of byteswap.  args[0] is
effectively in host native order (since it maps to a register, not
memory).  But the guest will have to assume some byteorder (BE, I'm
guessing, because PAPR) in order to map that register to an in-memory
byteorder.  So, you'll need to compensate for that here.

> +    return H_SUCCESS;
> +}
> +
> +
> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
> +                                         SpaprMachineState *spapr,
> +                                         target_ulong opcode,
> +                                         target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t data = args[2];
> +    int8_t numBytesToWrite = args[3];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    DeviceState *dev = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
> +        numBytesToWrite != 4 && numBytesToWrite != 8) {
> +        return H_P4;
> +    }
> +
> +    dev = drc->dev;
> +    nvdimm = NVDIMM(dev);
> +    if ((nvdimm->label_size < numBytesToWrite + offset) ||
> +        (offset + numBytesToWrite < offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);

Likewise here.

> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_idx = args[1];
> +    uint64_t no_of_scm_blocks_to_bind = args[2];
> +    uint64_t target_logical_mem_addr = args[3];
> +    uint64_t continue_token = args[4];
> +    uint64_t size;
> +    uint64_t total_no_of_scm_blocks;
> +
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    hwaddr addr;
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    Error *local_err = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +
> +    size = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_SIZE_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +    if ((starting_idx > total_no_of_scm_blocks) ||
> +        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
> +        return H_P2;
> +    }
> +
> +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
> +        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
> +        return H_P3;
> +    }
> +
> +    /* Currently qemu assigns the address. */
> +    if (target_logical_mem_addr != 0xffffffffffffffff) {
> +        return H_OVERLAP;

So, only supporting one mode of the interface is reasonable.  However,
it seems a somewhat unnatural way to handle the PAPR interface.  It
seems to me it would be more natural to instantiate the underlying
NVDIMM objects so that they're not in address_space_memory, but rather
in their own initially inaccssible address_space.  Then the BIND call
would alias a chunk of address_space_memory into the NVDIMMs address
space.

> +    }
> +
> +    /*
> +     * Currently continue token should be zero qemu has already bound
> +     * everything and this hcall doesnt return H_BUSY.
> +     */
> +    if (continue_token > 0) {
> +        return H_P5;
> +    }
> +
> +    /* NB : Already bound, Return target logical address in R4 */
> +    addr = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_ADDR_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    args[1] = addr;

Don't you need to adjust this if start_idx is non-zero?

> +    args[2] = no_of_scm_blocks_to_bind;
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_scm_logical_addr = args[1];
> +    uint64_t no_of_scm_blocks_to_unbind = args[2];
> +    uint64_t size_to_unbind;
> +    uint64_t continue_token = args[3];
> +    Range blockrange = range_empty;
> +    Range nvdimmrange = range_empty;
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    uint64_t size, addr;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    /* Check if starting_scm_logical_addr is block aligned */
> +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
> +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
> +        return H_P2;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
> +
> +    range_init_nofail(&nvdimmrange, addr, size);
> +
> +    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +
> +    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
> +
> +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
> +        return H_P3;
> +    }
> +
> +    if (continue_token > 0) {
> +        return H_P3;
> +    }
> +
> +    args[1] = no_of_scm_blocks_to_unbind;
> +
> +    /*NB : dont do anything, let object_del take care of this for now. */
> +    return H_SUCCESS;
> +}
> +
>  static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
>  static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
>  
> @@ -1894,6 +2090,12 @@ static void hypercall_register_types(void)
>      /* qemu/KVM-PPC specific hcalls */
>      spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
>  
> +    /* qemu/scm specific hcalls */
> +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
> +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
> +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
> +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
> +
>      /* ibm,client-architecture-support support */
>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 394ea26335..48e2cc9d67 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -283,6 +283,7 @@ struct SpaprMachineState {
>  #define H_P7              -60
>  #define H_P8              -61
>  #define H_P9              -62
> +#define H_OVERLAP         -68
>  #define H_UNSUPPORTED_FLAG -256
>  #define H_MULTI_THREADS_ACTIVE -9005
>  
> @@ -490,8 +491,12 @@ struct SpaprMachineState {
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> +#define H_SCM_READ_METADATA     0x3E4
> +#define H_SCM_WRITE_METADATA    0x3E8
> +#define H_SCM_BIND_MEM          0x3EC
> +#define H_SCM_UNBIND_MEM        0x3F0
>  
> -#define MAX_HCALL_OPCODE        H_INT_RESET
> +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
>
Fabiano Rosas May 22, 2019, 6:08 p.m. UTC | #2
Shivaprasad G Bhat <sbhat@linux.ibm.com> writes:

> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 6c16d2b120..b6e7d04dcf 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -3,11 +3,13 @@
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/sysemu.h"
>  #include "qemu/log.h"
> +#include "qemu/range.h"
>  #include "qemu/error-report.h"
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
>  #include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_drc.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "mmu-hash64.h"
>  #include "cpu-models.h"
> @@ -16,6 +18,7 @@
>  #include "hw/ppc/spapr_ovec.h"
>  #include "mmu-book3s-v3.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/mem/nvdimm.h"
>  
>  static bool has_spr(PowerPCCPU *cpu, int spr)
>  {
> @@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
>      return H_SUCCESS;
>  }
>  
> +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
> +                                        SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t numBytesToRead = args[2];

This variable's case is inconsistent with the rest of the file.

> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
> +        numBytesToRead != 4 && numBytesToRead != 8) {
> +        return H_P3;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    if ((offset + numBytesToRead < offset) ||
> +        (nvdimm->label_size < numBytesToRead + offset)) {
> +        return H_P2;
> +    }

Won't the first clause always be false? Considering they're both uint64_t.

> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
> +
> +    return H_SUCCESS;
> +}
> +
> +
> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
> +                                         SpaprMachineState *spapr,
> +                                         target_ulong opcode,
> +                                         target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t data = args[2];
> +    int8_t numBytesToWrite = args[3];

Likewise.

> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    DeviceState *dev = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
> +        numBytesToWrite != 4 && numBytesToWrite != 8) {
> +        return H_P4;
> +    }
> +
> +    dev = drc->dev;
> +    nvdimm = NVDIMM(dev);
> +    if ((nvdimm->label_size < numBytesToWrite + offset) ||
> +        (offset + numBytesToWrite < offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_idx = args[1];
> +    uint64_t no_of_scm_blocks_to_bind = args[2];
> +    uint64_t target_logical_mem_addr = args[3];
> +    uint64_t continue_token = args[4];
> +    uint64_t size;
> +    uint64_t total_no_of_scm_blocks;
> +
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    hwaddr addr;
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    Error *local_err = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +
> +    size = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_SIZE_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +    if ((starting_idx > total_no_of_scm_blocks) ||
> +        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
> +        return H_P2;
> +    }
> +
> +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
> +        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
> +        return H_P3;
> +    }

Same here.

> +
> +    /* Currently qemu assigns the address. */
> +    if (target_logical_mem_addr != 0xffffffffffffffff) {
> +        return H_OVERLAP;
> +    }
> +
> +    /*
> +     * Currently continue token should be zero qemu has already bound
> +     * everything and this hcall doesnt return H_BUSY.
> +     */
> +    if (continue_token > 0) {
> +        return H_P5;
> +    }
> +
> +    /* NB : Already bound, Return target logical address in R4 */
> +    addr = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_ADDR_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    args[1] = addr;
> +    args[2] = no_of_scm_blocks_to_bind;
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_scm_logical_addr = args[1];
> +    uint64_t no_of_scm_blocks_to_unbind = args[2];
> +    uint64_t size_to_unbind;
> +    uint64_t continue_token = args[3];
> +    Range blockrange = range_empty;
> +    Range nvdimmrange = range_empty;
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    uint64_t size, addr;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    /* Check if starting_scm_logical_addr is block aligned */
> +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
> +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
> +        return H_P2;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
> +
> +    range_init_nofail(&nvdimmrange, addr, size);
> +
> +    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +
> +    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
> +
> +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
> +        return H_P3;
> +    }
> +
> +    if (continue_token > 0) {
> +        return H_P3;
> +    }
> +
> +    args[1] = no_of_scm_blocks_to_unbind;
> +
> +    /*NB : dont do anything, let object_del take care of this for now. */
> +    return H_SUCCESS;
> +}
> +
>  static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
>  static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
>  
> @@ -1894,6 +2090,12 @@ static void hypercall_register_types(void)
>      /* qemu/KVM-PPC specific hcalls */
>      spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
>  
> +    /* qemu/scm specific hcalls */
> +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
> +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
> +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
> +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
> +
>      /* ibm,client-architecture-support support */
>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 394ea26335..48e2cc9d67 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -283,6 +283,7 @@ struct SpaprMachineState {
>  #define H_P7              -60
>  #define H_P8              -61
>  #define H_P9              -62
> +#define H_OVERLAP         -68
>  #define H_UNSUPPORTED_FLAG -256
>  #define H_MULTI_THREADS_ACTIVE -9005
>  
> @@ -490,8 +491,12 @@ struct SpaprMachineState {
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> +#define H_SCM_READ_METADATA     0x3E4
> +#define H_SCM_WRITE_METADATA    0x3E8
> +#define H_SCM_BIND_MEM          0x3EC
> +#define H_SCM_UNBIND_MEM        0x3F0
>  
> -#define MAX_HCALL_OPCODE        H_INT_RESET
> +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
Fabiano Rosas May 22, 2019, 6:11 p.m. UTC | #3
Fabiano Rosas <farosas@linux.ibm.com> writes:

>> +    nvdimm = NVDIMM(drc->dev);
>> +    if ((offset + numBytesToRead < offset) ||
>> +        (nvdimm->label_size < numBytesToRead + offset)) {
>> +        return H_P2;
>> +    }
>
> Won't the first clause always be false? Considering they're both uint64_t.

Neverming this question. I just saw that David asked for it in the previous
version.
David Gibson May 23, 2019, 12:46 a.m. UTC | #4
On Wed, May 22, 2019 at 03:08:34PM -0300, Fabiano Rosas wrote:
> Shivaprasad G Bhat <sbhat@linux.ibm.com> writes:
> 
> > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> > index 6c16d2b120..b6e7d04dcf 100644
> > --- a/hw/ppc/spapr_hcall.c
> > +++ b/hw/ppc/spapr_hcall.c
> > @@ -3,11 +3,13 @@
> >  #include "sysemu/hw_accel.h"
> >  #include "sysemu/sysemu.h"
> >  #include "qemu/log.h"
> > +#include "qemu/range.h"
> >  #include "qemu/error-report.h"
> >  #include "cpu.h"
> >  #include "exec/exec-all.h"
> >  #include "helper_regs.h"
> >  #include "hw/ppc/spapr.h"
> > +#include "hw/ppc/spapr_drc.h"
> >  #include "hw/ppc/spapr_cpu_core.h"
> >  #include "mmu-hash64.h"
> >  #include "cpu-models.h"
> > @@ -16,6 +18,7 @@
> >  #include "hw/ppc/spapr_ovec.h"
> >  #include "mmu-book3s-v3.h"
> >  #include "hw/mem/memory-device.h"
> > +#include "hw/mem/nvdimm.h"
> >  
> >  static bool has_spr(PowerPCCPU *cpu, int spr)
> >  {
> > @@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >      return H_SUCCESS;
> >  }
> >  
> > +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
> > +                                        SpaprMachineState *spapr,
> > +                                        target_ulong opcode,
> > +                                        target_ulong *args)
> > +{
> > +    uint32_t drc_index = args[0];
> > +    uint64_t offset = args[1];
> > +    uint64_t numBytesToRead = args[2];
> 
> This variable's case is inconsistent with the rest of the file.
> 
> > +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> > +    NVDIMMDevice *nvdimm = NULL;
> > +    NVDIMMClass *ddc = NULL;
> > +
> > +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
> > +        numBytesToRead != 4 && numBytesToRead != 8) {
> > +        return H_P3;
> > +    }
> > +
> > +    nvdimm = NVDIMM(drc->dev);
> > +    if ((offset + numBytesToRead < offset) ||
> > +        (nvdimm->label_size < numBytesToRead + offset)) {
> > +        return H_P2;
> > +    }
> 
> Won't the first clause always be false? Considering they're both
> uint64_t.

Generally yes, but that's caleed supplied input, so we need to protect
against 64-bit overflow.

> 
> > +
> > +    ddc = NVDIMM_GET_CLASS(nvdimm);
> > +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
> > +
> > +    return H_SUCCESS;
> > +}
> > +
> > +
> > +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
> > +                                         SpaprMachineState *spapr,
> > +                                         target_ulong opcode,
> > +                                         target_ulong *args)
> > +{
> > +    uint32_t drc_index = args[0];
> > +    uint64_t offset = args[1];
> > +    uint64_t data = args[2];
> > +    int8_t numBytesToWrite = args[3];
> 
> Likewise.
> 
> > +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> > +    NVDIMMDevice *nvdimm = NULL;
> > +    DeviceState *dev = NULL;
> > +    NVDIMMClass *ddc = NULL;
> > +
> > +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
> > +        numBytesToWrite != 4 && numBytesToWrite != 8) {
> > +        return H_P4;
> > +    }
> > +
> > +    dev = drc->dev;
> > +    nvdimm = NVDIMM(dev);
> > +    if ((nvdimm->label_size < numBytesToWrite + offset) ||
> > +        (offset + numBytesToWrite < offset)) {
> > +        return H_P2;
> > +    }
> > +
> > +    ddc = NVDIMM_GET_CLASS(nvdimm);
> > +    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);
> > +
> > +    return H_SUCCESS;
> > +}
> > +
> > +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> > +                                        target_ulong opcode,
> > +                                        target_ulong *args)
> > +{
> > +    uint32_t drc_index = args[0];
> > +    uint64_t starting_idx = args[1];
> > +    uint64_t no_of_scm_blocks_to_bind = args[2];
> > +    uint64_t target_logical_mem_addr = args[3];
> > +    uint64_t continue_token = args[4];
> > +    uint64_t size;
> > +    uint64_t total_no_of_scm_blocks;
> > +
> > +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> > +    hwaddr addr;
> > +    DeviceState *dev = NULL;
> > +    PCDIMMDevice *dimm = NULL;
> > +    Error *local_err = NULL;
> > +
> > +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    dev = drc->dev;
> > +    dimm = PC_DIMM(dev);
> > +
> > +    size = object_property_get_uint(OBJECT(dimm),
> > +                                    PC_DIMM_SIZE_PROP, &local_err);
> > +    if (local_err) {
> > +        error_report_err(local_err);
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> > +
> > +    if ((starting_idx > total_no_of_scm_blocks) ||
> > +        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
> > +        return H_P2;
> > +    }
> > +
> > +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
> > +        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
> > +        return H_P3;
> > +    }
> 
> Same here.
> 
> > +
> > +    /* Currently qemu assigns the address. */
> > +    if (target_logical_mem_addr != 0xffffffffffffffff) {
> > +        return H_OVERLAP;
> > +    }
> > +
> > +    /*
> > +     * Currently continue token should be zero qemu has already bound
> > +     * everything and this hcall doesnt return H_BUSY.
> > +     */
> > +    if (continue_token > 0) {
> > +        return H_P5;
> > +    }
> > +
> > +    /* NB : Already bound, Return target logical address in R4 */
> > +    addr = object_property_get_uint(OBJECT(dimm),
> > +                                    PC_DIMM_ADDR_PROP, &local_err);
> > +    if (local_err) {
> > +        error_report_err(local_err);
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    args[1] = addr;
> > +    args[2] = no_of_scm_blocks_to_bind;
> > +
> > +    return H_SUCCESS;
> > +}
> > +
> > +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> > +                                        target_ulong opcode,
> > +                                        target_ulong *args)
> > +{
> > +    uint32_t drc_index = args[0];
> > +    uint64_t starting_scm_logical_addr = args[1];
> > +    uint64_t no_of_scm_blocks_to_unbind = args[2];
> > +    uint64_t size_to_unbind;
> > +    uint64_t continue_token = args[3];
> > +    Range blockrange = range_empty;
> > +    Range nvdimmrange = range_empty;
> > +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> > +    DeviceState *dev = NULL;
> > +    PCDIMMDevice *dimm = NULL;
> > +    uint64_t size, addr;
> > +
> > +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> > +        return H_PARAMETER;
> > +    }
> > +
> > +    /* Check if starting_scm_logical_addr is block aligned */
> > +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
> > +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
> > +        return H_P2;
> > +    }
> > +
> > +    dev = drc->dev;
> > +    dimm = PC_DIMM(dev);
> > +    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
> > +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
> > +
> > +    range_init_nofail(&nvdimmrange, addr, size);
> > +
> > +    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> > +
> > +
> > +    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
> > +
> > +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
> > +        return H_P3;
> > +    }
> > +
> > +    if (continue_token > 0) {
> > +        return H_P3;
> > +    }
> > +
> > +    args[1] = no_of_scm_blocks_to_unbind;
> > +
> > +    /*NB : dont do anything, let object_del take care of this for now. */
> > +    return H_SUCCESS;
> > +}
> > +
> >  static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
> >  static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
> >  
> > @@ -1894,6 +2090,12 @@ static void hypercall_register_types(void)
> >      /* qemu/KVM-PPC specific hcalls */
> >      spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
> >  
> > +    /* qemu/scm specific hcalls */
> > +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
> > +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
> > +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
> > +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
> > +
> >      /* ibm,client-architecture-support support */
> >      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
> >  
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index 394ea26335..48e2cc9d67 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -283,6 +283,7 @@ struct SpaprMachineState {
> >  #define H_P7              -60
> >  #define H_P8              -61
> >  #define H_P9              -62
> > +#define H_OVERLAP         -68
> >  #define H_UNSUPPORTED_FLAG -256
> >  #define H_MULTI_THREADS_ACTIVE -9005
> >  
> > @@ -490,8 +491,12 @@ struct SpaprMachineState {
> >  #define H_INT_ESB               0x3C8
> >  #define H_INT_SYNC              0x3CC
> >  #define H_INT_RESET             0x3D0
> > +#define H_SCM_READ_METADATA     0x3E4
> > +#define H_SCM_WRITE_METADATA    0x3E8
> > +#define H_SCM_BIND_MEM          0x3EC
> > +#define H_SCM_UNBIND_MEM        0x3F0
> >  
> > -#define MAX_HCALL_OPCODE        H_INT_RESET
> > +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
> >  
> >  /* The hcalls above are standardized in PAPR and implemented by pHyp
> >   * as well.
>
Shivaprasad G Bhat July 11, 2019, 5:46 a.m. UTC | #5
On 05/14/2019 10:08 AM, David Gibson wrote:
> On Mon, May 13, 2019 at 04:28:36AM -0500, Shivaprasad G Bhat wrote:
>> This patch implements few of the necessary hcalls for the nvdimm support.
>>
>> PAPR semantics is such that each NVDIMM device is comprising of multiple
>> SCM(Storage Class Memory) blocks. The guest requests the hypervisor to bind
>> each of the SCM blocks of the NVDIMM device using hcalls. There can be
>> SCM block unbind requests in case of driver errors or unplug(not supported now)
>> use cases. The NVDIMM label read/writes are done through hcalls.
>>
>> Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind,
>> unbind, and queries using hcalls on those blocks can come independently. This
>> doesn't fit well into the qemu device semantics, where the map/unmap are done
>> at the (whole)device/object level granularity. The patch doesnt actually
>> bind/unbind on hcalls but let it happen at the object_add/del phase itself
>> instead.
>>
>> The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the
>> region level granularity. Without interleaving, each virtual NVDIMM device is
>> presented as separate region. There is no way to configure the virtual NVDIMM
>> interleaving for the guests today. So, there is no way a partial bind/unbind
>> request can come for the vNVDIMM in a hcall for a subset of SCM blocks of a
>> virtual NVDIMM. Hence it is safe to do bind/unbind everything during the
>> object_add/del.
>>
>> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
>> ---
>>   hw/ppc/spapr_hcall.c   |  202 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/hw/ppc/spapr.h |    7 +-
>>   2 files changed, 208 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
>> index 6c16d2b120..b6e7d04dcf 100644
>> --- a/hw/ppc/spapr_hcall.c
>> +++ b/hw/ppc/spapr_hcall.c
>> @@ -3,11 +3,13 @@
>>   #include "sysemu/hw_accel.h"
>>   #include "sysemu/sysemu.h"
>>   #include "qemu/log.h"
>> +#include "qemu/range.h"
>>   #include "qemu/error-report.h"
>>   #include "cpu.h"
>>   #include "exec/exec-all.h"
>>   #include "helper_regs.h"
>>   #include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_drc.h"
>>   #include "hw/ppc/spapr_cpu_core.h"
>>   #include "mmu-hash64.h"
>>   #include "cpu-models.h"
>> @@ -16,6 +18,7 @@
>>   #include "hw/ppc/spapr_ovec.h"
>>   #include "mmu-book3s-v3.h"
>>   #include "hw/mem/memory-device.h"
>> +#include "hw/mem/nvdimm.h"
>>   
>>   static bool has_spr(PowerPCCPU *cpu, int spr)
>>   {
>> @@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>       return H_SUCCESS;
>>   }
>>   
>> +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
>> +                                        SpaprMachineState *spapr,
>> +                                        target_ulong opcode,
>> +                                        target_ulong *args)
>> +{
>> +    uint32_t drc_index = args[0];
>> +    uint64_t offset = args[1];
>> +    uint64_t numBytesToRead = args[2];
>> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
>> +    NVDIMMDevice *nvdimm = NULL;
>> +    NVDIMMClass *ddc = NULL;
>> +
>> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
>> +        numBytesToRead != 4 && numBytesToRead != 8) {
>> +        return H_P3;
>> +    }
>> +
>> +    nvdimm = NVDIMM(drc->dev);
>> +    if ((offset + numBytesToRead < offset) ||
>> +        (nvdimm->label_size < numBytesToRead + offset)) {
>> +        return H_P2;
>> +    }
>> +
>> +    ddc = NVDIMM_GET_CLASS(nvdimm);
>> +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
> I'm pretty sure this will need some sort of byteswap.  args[0] is
> effectively in host native order (since it maps to a register, not
> memory).  But the guest will have to assume some byteorder (BE, I'm
> guessing, because PAPR) in order to map that register to an in-memory
> byteorder.  So, you'll need to compensate for that here.

You are right, I have figured out the required changes working with the 
kernel
team. Will post it in the next version.

>
>> +    return H_SUCCESS;
>> +}
>> +
>> +
>> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
>> +                                         SpaprMachineState *spapr,
>> +                                         target_ulong opcode,
>> +                                         target_ulong *args)
>> +{
>> +    uint32_t drc_index = args[0];
>> +    uint64_t offset = args[1];
>> +    uint64_t data = args[2];
>> +    int8_t numBytesToWrite = args[3];
>> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
>> +    NVDIMMDevice *nvdimm = NULL;
>> +    DeviceState *dev = NULL;
>> +    NVDIMMClass *ddc = NULL;
>> +
>> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
>> +        numBytesToWrite != 4 && numBytesToWrite != 8) {
>> +        return H_P4;
>> +    }
>> +
>> +    dev = drc->dev;
>> +    nvdimm = NVDIMM(dev);
>> +    if ((nvdimm->label_size < numBytesToWrite + offset) ||
>> +        (offset + numBytesToWrite < offset)) {
>> +        return H_P2;
>> +    }
>> +
>> +    ddc = NVDIMM_GET_CLASS(nvdimm);
>> +    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);
> Likewise here.
>
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> +                                        target_ulong opcode,
>> +                                        target_ulong *args)
>> +{
>> +    uint32_t drc_index = args[0];
>> +    uint64_t starting_idx = args[1];
>> +    uint64_t no_of_scm_blocks_to_bind = args[2];
>> +    uint64_t target_logical_mem_addr = args[3];
>> +    uint64_t continue_token = args[4];
>> +    uint64_t size;
>> +    uint64_t total_no_of_scm_blocks;
>> +
>> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
>> +    hwaddr addr;
>> +    DeviceState *dev = NULL;
>> +    PCDIMMDevice *dimm = NULL;
>> +    Error *local_err = NULL;
>> +
>> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    dev = drc->dev;
>> +    dimm = PC_DIMM(dev);
>> +
>> +    size = object_property_get_uint(OBJECT(dimm),
>> +                                    PC_DIMM_SIZE_PROP, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>> +
>> +    if ((starting_idx > total_no_of_scm_blocks) ||
>> +        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
>> +        return H_P2;
>> +    }
>> +
>> +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
>> +        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
>> +        return H_P3;
>> +    }
>> +
>> +    /* Currently qemu assigns the address. */
>> +    if (target_logical_mem_addr != 0xffffffffffffffff) {
>> +        return H_OVERLAP;
> So, only supporting one mode of the interface is reasonable.  However,
> it seems a somewhat unnatural way to handle the PAPR interface.  It
> seems to me it would be more natural to instantiate the underlying
> NVDIMM objects so that they're not in address_space_memory, but rather
> in their own initially inaccssible address_space.  Then the BIND call
> would alias a chunk of address_space_memory into the NVDIMMs address
> space.

The pre-plug checks require the memory region to be initialized before 
reaching
there. So, I cant avoid the minimal initialization with the size calling
memory_region_init() at the nvdimm_prepare_memory_region(). If I delay
the aliasing till the bind hcall, things seem to work.

Are you suggesting me to do something like this?

https://github.ibm.com/shivapbh/qemu-1/commit/04e0a5c7ef71db942be5ced936fde93dd7bb3ee4

>> +    }
>> +
>> +    /*
>> +     * Currently continue token should be zero qemu has already bound
>> +     * everything and this hcall doesnt return H_BUSY.
>> +     */
>> +    if (continue_token > 0) {
>> +        return H_P5;
>> +    }
>> +
>> +    /* NB : Already bound, Return target logical address in R4 */
>> +    addr = object_property_get_uint(OBJECT(dimm),
>> +                                    PC_DIMM_ADDR_PROP, &local_err);
>> +    if (local_err) {
>> +        error_report_err(local_err);
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    args[1] = addr;
> Don't you need to adjust this if start_idx is non-zero?

Since I don't support partial bind request, start_idx can never be 
non-zero. I think
I need to enforce it with more checks here.

>
>> +    args[2] = no_of_scm_blocks_to_bind;
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> +                                        target_ulong opcode,
>> +                                        target_ulong *args)
>> +{
>> +    uint32_t drc_index = args[0];
>> +    uint64_t starting_scm_logical_addr = args[1];
>> +    uint64_t no_of_scm_blocks_to_unbind = args[2];
>> +    uint64_t size_to_unbind;
>> +    uint64_t continue_token = args[3];
>> +    Range blockrange = range_empty;
>> +    Range nvdimmrange = range_empty;
>> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
>> +    DeviceState *dev = NULL;
>> +    PCDIMMDevice *dimm = NULL;
>> +    uint64_t size, addr;
>> +
>> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
>> +        return H_PARAMETER;
>> +    }
>> +
>> +    /* Check if starting_scm_logical_addr is block aligned */
>> +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
>> +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
>> +        return H_P2;
>> +    }
>> +
>> +    dev = drc->dev;
>> +    dimm = PC_DIMM(dev);
>> +    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
>> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
>> +
>> +    range_init_nofail(&nvdimmrange, addr, size);
>> +
>> +    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
>> +
>> +
>> +    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
>> +
>> +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
>> +        return H_P3;
>> +    }
>> +
>> +    if (continue_token > 0) {
>> +        return H_P3;
>> +    }
>> +
>> +    args[1] = no_of_scm_blocks_to_unbind;
>> +
>> +    /*NB : dont do anything, let object_del take care of this for now. */
>> +    return H_SUCCESS;
>> +}
>> +
>>   static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
>>   static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
>>   
>> @@ -1894,6 +2090,12 @@ static void hypercall_register_types(void)
>>       /* qemu/KVM-PPC specific hcalls */
>>       spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
>>   
>> +    /* qemu/scm specific hcalls */
>> +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
>> +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
>> +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
>> +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
>> +
>>       /* ibm,client-architecture-support support */
>>       spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>>   
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 394ea26335..48e2cc9d67 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -283,6 +283,7 @@ struct SpaprMachineState {
>>   #define H_P7              -60
>>   #define H_P8              -61
>>   #define H_P9              -62
>> +#define H_OVERLAP         -68
>>   #define H_UNSUPPORTED_FLAG -256
>>   #define H_MULTI_THREADS_ACTIVE -9005
>>   
>> @@ -490,8 +491,12 @@ struct SpaprMachineState {
>>   #define H_INT_ESB               0x3C8
>>   #define H_INT_SYNC              0x3CC
>>   #define H_INT_RESET             0x3D0
>> +#define H_SCM_READ_METADATA     0x3E4
>> +#define H_SCM_WRITE_METADATA    0x3E8
>> +#define H_SCM_BIND_MEM          0x3EC
>> +#define H_SCM_UNBIND_MEM        0x3F0
>>   
>> -#define MAX_HCALL_OPCODE        H_INT_RESET
>> +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
>>   
>>   /* The hcalls above are standardized in PAPR and implemented by pHyp
>>    * as well.
>>
Shivaprasad G Bhat July 11, 2019, 5:51 a.m. UTC | #6
Thanks for the comments Fabiano.


On 05/22/2019 11:38 PM, Fabiano Rosas wrote:
> Shivaprasad G Bhat <sbhat@linux.ibm.com> writes:
>
> +
>> +    ddc = NVDIMM_GET_CLASS(nvdimm);
>> +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
>> +
>> +    return H_SUCCESS;
>> +}
>> +
>> +
>> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
>> +                                         SpaprMachineState *spapr,
>> +                                         target_ulong opcode,
>> +                                         target_ulong *args)
>> +{
>> +    uint32_t drc_index = args[0];
>> +    uint64_t offset = args[1];
>> +    uint64_t data = args[2];
>> +    int8_t numBytesToWrite = args[3];
> Likewise.

This is supposed to be uint64_t like used in other places.
Will fix it in the next version.

Rest of the comments I think David has already answered the rational
behind the usage to avoid integer overflow.

>
>> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
>> +    NVDIMMDevice *nvdimm = NULL;
>> +    DeviceState *dev = NULL;
>>

Regards,
Shivaprasad
diff mbox series

Patch

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 6c16d2b120..b6e7d04dcf 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -3,11 +3,13 @@ 
 #include "sysemu/hw_accel.h"
 #include "sysemu/sysemu.h"
 #include "qemu/log.h"
+#include "qemu/range.h"
 #include "qemu/error-report.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "helper_regs.h"
 #include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_cpu_core.h"
 #include "mmu-hash64.h"
 #include "cpu-models.h"
@@ -16,6 +18,7 @@ 
 #include "hw/ppc/spapr_ovec.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "hw/mem/nvdimm.h"
 
 static bool has_spr(PowerPCCPU *cpu, int spr)
 {
@@ -1795,6 +1798,199 @@  static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
     return H_SUCCESS;
 }
 
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+                                        SpaprMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t numBytesToRead = args[2];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm = NULL;
+    NVDIMMClass *ddc = NULL;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToRead != 1 && numBytesToRead != 2 &&
+        numBytesToRead != 4 && numBytesToRead != 8) {
+        return H_P3;
+    }
+
+    nvdimm = NVDIMM(drc->dev);
+    if ((offset + numBytesToRead < offset) ||
+        (nvdimm->label_size < numBytesToRead + offset)) {
+        return H_P2;
+    }
+
+    ddc = NVDIMM_GET_CLASS(nvdimm);
+    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
+
+    return H_SUCCESS;
+}
+
+
+static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
+                                         SpaprMachineState *spapr,
+                                         target_ulong opcode,
+                                         target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t data = args[2];
+    int8_t numBytesToWrite = args[3];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm = NULL;
+    DeviceState *dev = NULL;
+    NVDIMMClass *ddc = NULL;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
+        numBytesToWrite != 4 && numBytesToWrite != 8) {
+        return H_P4;
+    }
+
+    dev = drc->dev;
+    nvdimm = NVDIMM(dev);
+    if ((nvdimm->label_size < numBytesToWrite + offset) ||
+        (offset + numBytesToWrite < offset)) {
+        return H_P2;
+    }
+
+    ddc = NVDIMM_GET_CLASS(nvdimm);
+    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t starting_idx = args[1];
+    uint64_t no_of_scm_blocks_to_bind = args[2];
+    uint64_t target_logical_mem_addr = args[3];
+    uint64_t continue_token = args[4];
+    uint64_t size;
+    uint64_t total_no_of_scm_blocks;
+
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    hwaddr addr;
+    DeviceState *dev = NULL;
+    PCDIMMDevice *dimm = NULL;
+    Error *local_err = NULL;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    dev = drc->dev;
+    dimm = PC_DIMM(dev);
+
+    size = object_property_get_uint(OBJECT(dimm),
+                                    PC_DIMM_SIZE_PROP, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return H_PARAMETER;
+    }
+
+    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
+
+    if ((starting_idx > total_no_of_scm_blocks) ||
+        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
+        return H_P2;
+    }
+
+    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
+        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
+        return H_P3;
+    }
+
+    /* Currently qemu assigns the address. */
+    if (target_logical_mem_addr != 0xffffffffffffffff) {
+        return H_OVERLAP;
+    }
+
+    /*
+     * Currently continue token should be zero qemu has already bound
+     * everything and this hcall doesnt return H_BUSY.
+     */
+    if (continue_token > 0) {
+        return H_P5;
+    }
+
+    /* NB : Already bound, Return target logical address in R4 */
+    addr = object_property_get_uint(OBJECT(dimm),
+                                    PC_DIMM_ADDR_PROP, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return H_PARAMETER;
+    }
+
+    args[1] = addr;
+    args[2] = no_of_scm_blocks_to_bind;
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t starting_scm_logical_addr = args[1];
+    uint64_t no_of_scm_blocks_to_unbind = args[2];
+    uint64_t size_to_unbind;
+    uint64_t continue_token = args[3];
+    Range blockrange = range_empty;
+    Range nvdimmrange = range_empty;
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    DeviceState *dev = NULL;
+    PCDIMMDevice *dimm = NULL;
+    uint64_t size, addr;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    /* Check if starting_scm_logical_addr is block aligned */
+    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
+                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
+        return H_P2;
+    }
+
+    dev = drc->dev;
+    dimm = PC_DIMM(dev);
+    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
+    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
+
+    range_init_nofail(&nvdimmrange, addr, size);
+
+    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
+
+
+    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
+
+    if (!range_contains_range(&nvdimmrange, &blockrange)) {
+        return H_P3;
+    }
+
+    if (continue_token > 0) {
+        return H_P3;
+    }
+
+    args[1] = no_of_scm_blocks_to_unbind;
+
+    /*NB : dont do anything, let object_del take care of this for now. */
+    return H_SUCCESS;
+}
+
 static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
 static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
 
@@ -1894,6 +2090,12 @@  static void hypercall_register_types(void)
     /* qemu/KVM-PPC specific hcalls */
     spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
 
+    /* qemu/scm specific hcalls */
+    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
+    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
+    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
+    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
+
     /* ibm,client-architecture-support support */
     spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 394ea26335..48e2cc9d67 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -283,6 +283,7 @@  struct SpaprMachineState {
 #define H_P7              -60
 #define H_P8              -61
 #define H_P9              -62
+#define H_OVERLAP         -68
 #define H_UNSUPPORTED_FLAG -256
 #define H_MULTI_THREADS_ACTIVE -9005
 
@@ -490,8 +491,12 @@  struct SpaprMachineState {
 #define H_INT_ESB               0x3C8
 #define H_INT_SYNC              0x3CC
 #define H_INT_RESET             0x3D0
+#define H_SCM_READ_METADATA     0x3E4
+#define H_SCM_WRITE_METADATA    0x3E8
+#define H_SCM_BIND_MEM          0x3EC
+#define H_SCM_UNBIND_MEM        0x3F0
 
-#define MAX_HCALL_OPCODE        H_INT_RESET
+#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
 
 /* The hcalls above are standardized in PAPR and implemented by pHyp
  * as well.