
[RFC,v2,22/23] spapr: Support ibm,dynamic-reconfiguration-memory

Message ID 20150330091150.GB4753@in.ibm.com
State New

Commit Message

Bharata B Rao March 30, 2015, 9:11 a.m. UTC
On Thu, Mar 26, 2015 at 02:44:17PM +1100, David Gibson wrote:
> > +/*
> > + * TODO: Take care of sparsemem configuration?
> > + */
> > +static uint64_t numa_node_end(uint32_t nodeid)
> > +{
> > +    uint32_t i = 0;
> > +    uint64_t addr = 0;
> > +
> > +    do {
> > +        addr += numa_info[i].node_mem;
> > +    } while (++i <= nodeid);
> > +
> > +    return addr;
> > +}
> > +
> > +static uint64_t numa_node_start(uint32_t nodeid)
> > +{
> > +    if (!nodeid) {
> > +        return 0;
> > +    } else {
> > +        return numa_node_end(nodeid - 1);
> > +    }
> > +}
> > +
> > +/*
> > + * Given an address, return the NUMA node to which the address belongs.
> > + */
> > +static uint32_t get_numa_node(uint64_t addr)
> > +{
> > +    uint32_t i;
> > +
> > +    for (i = 0; i < nb_numa_nodes; i++) {
> > +        if ((addr >= numa_node_start(i)) && (addr < numa_node_end(i))) {
> > +            return i;
> > +        }
> > +    }
> 
> This function is O(N^2) in the number of nodes, since each call to
> numa_node_start()/numa_node_end() walks the node list again; that's a bit
> hideous for something so simple.

Will something like the below work? Will all archs be OK with this?

numa: Store start and end address range of each node in numa_info

Keep track of start and end address of each NUMA node in numa_info
structure so that lookup of node by address becomes easier.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---
 include/sysemu/numa.h |    3 +++
 numa.c                |   28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

> > +    /* Unassigned memory goes to node 0 by default */
> > +    return 0;
> > +}
> > +
> > +/*
> > + * Adds ibm,dynamic-reconfiguration-memory node.
> > + * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
> > + * of this device tree node.
> > + */
> > +static int spapr_populate_drconf_memory(sPAPREnvironment *spapr, void *fdt)
> > +{
> > +    int ret, i, offset;
> > +    uint32_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
> > +    uint32_t nr_rma_lmbs = spapr->rma_size/lmb_size;
> > +    uint32_t nr_lmbs = spapr->maxram_limit/lmb_size - nr_rma_lmbs;
> > +    uint32_t nr_assigned_lmbs = spapr->ram_limit/lmb_size - nr_rma_lmbs;
> > +    uint32_t *int_buf, *cur_index, buf_len;
> > +
> > +    /* Allocate enough buffer size to fit in ibm,dynamic-memory */
> > +    buf_len = nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE * sizeof(uint32_t) +
> > +                sizeof(uint32_t);
> > +    cur_index = int_buf = g_malloc0(buf_len);
> > +
> > +    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
> > +
> > +    ret = fdt_setprop_u64(fdt, offset, "ibm,lmb-size", lmb_size);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    /* ibm,dynamic-memory */
> > +    int_buf[0] = cpu_to_be32(nr_lmbs);
> > +    cur_index++;
> > +    for (i = 0; i < nr_lmbs; i++) {
> > +        sPAPRDRConnector *drc;
> > +        sPAPRDRConnectorClass *drck;
> > +        uint64_t addr;
> > +        uint32_t *dynamic_memory = cur_index;
> > +
> > +        if (i < nr_assigned_lmbs) {
> > +            addr = (i + nr_rma_lmbs) * lmb_size;
> > +        } else {
> > +            addr = (i - nr_assigned_lmbs) * lmb_size +
> > +                SPAPR_MACHINE(qdev_get_machine())->hotplug_memory_base;
> > +        }
> > +        drc = spapr_dr_connector_new(qdev_get_machine(),
> > +                SPAPR_DR_CONNECTOR_TYPE_LMB, addr/lmb_size);
> > +        drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +
> > +        dynamic_memory[0] = cpu_to_be32(addr >> 32);
> > +        dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
> > +        dynamic_memory[2] = cpu_to_be32(drck->get_index(drc));
> > +        dynamic_memory[3] = cpu_to_be32(0); /* reserved */
> > +        dynamic_memory[4] = cpu_to_be32(get_numa_node(addr));
> > +        dynamic_memory[5] = (addr < spapr->ram_limit) ?
> > +                            cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED) :
> > +                            cpu_to_be32(0);
> > +
> > +        cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
> > +    }
> > +    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
> > +    if (ret < 0) {
> > +        goto out;
> > +    }
> > +
> > +    /* ibm,associativity-lookup-arrays */
> > +    cur_index = int_buf;
> > +    int_buf[0] = cpu_to_be32(nb_numa_nodes);
> > +    int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
> > +    cur_index += 2;
> > +    for (i = 0; i < nb_numa_nodes; i++) {
> > +        uint32_t associativity[] = {
> > +            cpu_to_be32(0x0),
> > +            cpu_to_be32(0x0),
> > +            cpu_to_be32(0x0),
> > +            cpu_to_be32(i)
> > +        };
> > +        memcpy(cur_index, associativity, sizeof(associativity));
> > +        cur_index += 4;
> > +    }
> > +    ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
> > +            (cur_index - int_buf) * sizeof(uint32_t));
> > +out:
> > +    g_free(int_buf);
> > +    return ret;
> > +}
> > +
> > +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size,
> > +                                bool cpu_update, bool memory_update)
> > +{
> > +    void *fdt, *fdt_skel;
> > +    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
> > +
> > +    size -= sizeof(hdr);
> > +
> > +    /* Create skeleton */
> > +    fdt_skel = g_malloc0(size);
> > +    _FDT((fdt_create(fdt_skel, size)));
> > +    _FDT((fdt_begin_node(fdt_skel, "")));
> > +    _FDT((fdt_end_node(fdt_skel)));
> > +    _FDT((fdt_finish(fdt_skel)));
> > +    fdt = g_malloc0(size);
> > +    _FDT((fdt_open_into(fdt_skel, fdt, size)));
> > +    g_free(fdt_skel);
> > +
> > +    /* Fixup cpu nodes */
> > +    if (cpu_update) {
> > +        _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
> > +    }
> > +
> > +    /* Generate memory nodes or ibm,dynamic-reconfiguration-memory node */
> > +    if (memory_update) {
> > +        _FDT((spapr_populate_drconf_memory(spapr, fdt)));
> > +    } else {
> > +        _FDT((spapr_populate_memory(spapr, fdt)));
> > +    }
> > +
> > +    /* Pack resulting tree */
> > +    _FDT((fdt_pack(fdt)));
> > +
> > +    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
> > +        trace_spapr_cas_failed(size);
> > +        g_free(fdt);
> > +        return -1;
> > +    }
> > +
> > +    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
> > +    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
> > +    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
> > +    g_free(fdt);
> > +
> > +    return 0;
> > +}
> > +
> >  static void spapr_finalize_fdt(sPAPREnvironment *spapr,
> >                                 hwaddr fdt_addr,
> >                                 hwaddr rtas_addr,
> > @@ -791,11 +934,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
> >      /* open out the base tree into a temp buffer for the final tweaks */
> >      _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
> >  
> > -    ret = spapr_populate_memory(spapr, fdt);
> > -    if (ret < 0) {
> > -        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
> > -        exit(1);
> > -    }
> > +    /*
> > +     * Add memory@0 node to represent RMA. Rest of the memory is either
> > +     * represented by memory nodes or ibm,dynamic-reconfiguration-memory
> > +     * node later during ibm,client-architecture-support call.
> > +     */
> > +    spapr_populate_memory_node(fdt, 0, 0, spapr->rma_size);
> >  
> >      ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
> >      if (ret < 0) {
> > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> > index 4f76f1c..20507c6 100644
> > --- a/hw/ppc/spapr_hcall.c
> > +++ b/hw/ppc/spapr_hcall.c
> > @@ -807,6 +807,32 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> >      return ret;
> >  }
> >  
> > +/*
> > + * Return the offset to the requested option vector @vector in the
> > + * option vector table @table.
> > + */
> > +static target_ulong cas_get_option_vector(int vector, target_ulong table)
> > +{
> > +    int i;
> > +    char nr_vectors, nr_entries;
> > +
> > +    if (!table) {
> > +        return 0;
> > +    }
> > +
> > +    nr_vectors = (rtas_ld(table, 0) >> 24) + 1;
> 
> I don't think rtas_ld() should be used outside its intended function
> in rtas.  Make a direct call to ldl_phys instead.

OK, but @table here is an RTAS argument passed from the main RTAS routine to
this function. Can't rtas_ld() still be used in that case?

Regards,
Bharata.

Comments

David Gibson March 31, 2015, 2:19 a.m. UTC | #1
On Mon, Mar 30, 2015 at 02:41:50PM +0530, Bharata B Rao wrote:
> On Thu, Mar 26, 2015 at 02:44:17PM +1100, David Gibson wrote:
> > > +/*
> > > + * TODO: Take care of sparsemem configuration?
> > > + */
> > > +static uint64_t numa_node_end(uint32_t nodeid)
> > > +{
> > > +    uint32_t i = 0;
> > > +    uint64_t addr = 0;
> > > +
> > > +    do {
> > > +        addr += numa_info[i].node_mem;
> > > +    } while (++i <= nodeid);
> > > +
> > > +    return addr;
> > > +}
> > > +
> > > +static uint64_t numa_node_start(uint32_t nodeid)
> > > +{
> > > +    if (!nodeid) {
> > > +        return 0;
> > > +    } else {
> > > +        return numa_node_end(nodeid - 1);
> > > +    }
> > > +}
> > > +
> > > +/*
> > > + * Given an address, return the NUMA node to which the address belongs.
> > > + */
> > > +static uint32_t get_numa_node(uint64_t addr)
> > > +{
> > > +    uint32_t i;
> > > +
> > > +    for (i = 0; i < nb_numa_nodes; i++) {
> > > +        if ((addr >= numa_node_start(i)) && (addr < numa_node_end(i))) {
> > > +            return i;
> > > +        }
> > > +    }
> > 
> > This function is O(N^2) in the number of nodes, since each call to
> > numa_node_start()/numa_node_end() walks the node list again; that's a bit
> > hideous for something so simple.
> 
> Will something like the below work? Will all archs be OK with this?

The below looks like a reasonable idea to me, but it's really the call
of the numa and memory subsystem people.

Paolo, do you have an opinion?  The basic problem here is to have a
function which returns which NUMA node a given address lies in.

> numa: Store start and end address range of each node in numa_info
> 
> Keep track of start and end address of each NUMA node in numa_info
> structure so that lookup of node by address becomes easier.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> ---
>  include/sysemu/numa.h |    3 +++
>  numa.c                |   28 ++++++++++++++++++++++++++++
>  2 files changed, 31 insertions(+)
> 
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index 5633b85..6dd6387 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -14,11 +14,14 @@ typedef struct node_info {
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
>      bool present;
> +    uint64_t mem_start;
> +    uint64_t mem_end;

I suspect these should be hwaddr, or possibly ram_addr_t, though.
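
For illustration, a sketch of the two fields with the suggested type (hwaddr,
from include/exec/hwaddr.h; ram_addr_t would read the same), with the other
NodeInfo fields elided:

    typedef struct node_info {
        /* ... node_mem, node_cpu bitmap, node_memdev, present ... */
        hwaddr mem_start;   /* first guest-physical address of the node */
        hwaddr mem_end;     /* exclusive end: mem_start + node_mem */
    } NodeInfo;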

[snip]
> > I don't think rtas_ld() should be used outside its intended function
> > in rtas.  Make a direct call to ldl_phys instead.
> 
> OK, but @table here is an RTAS argument passed from the main RTAS routine to
> this function. Can't rtas_ld() still be used in that case?

So, rtas_ld() was never intended as a general purpose load function
for RTAS use, but specifically for loading arguments from an RTAS
style argument list.

It's already being used for more than that now in the RTAS code, which
I'm not entirely comfortable with.  I'd prefer not to extend its use
any further.
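
For reference, a sketch of how cas_get_option_vector() might read the table
through ldl_phys() directly, per the suggestion above. It assumes the
ldl_phys(AddressSpace *, hwaddr) signature and &address_space_memory (both
available to spapr_hcall.c), and the vector-skipping loop is an illustrative
reconstruction, since the rest of the function is not quoted in this thread:

    static target_ulong cas_get_option_vector(int vector, target_ulong table)
    {
        int i;
        char nr_vectors, nr_entries;

        if (!table) {
            return 0;
        }

        /* Top byte of the first word is the number of vectors, minus 1 */
        nr_vectors = (ldl_phys(&address_space_memory, table) >> 24) + 1;
        if (!vector || vector > nr_vectors) {
            return 0;
        }
        table++;                   /* skip the number-of-vectors byte */

        /* Skip over the preceding option vectors to reach @vector */
        for (i = 0; i < vector - 1; i++) {
            /* Top byte of each vector is its length, minus 1 */
            nr_entries = ldl_phys(&address_space_memory, table) >> 24;
            table += nr_entries + 2;
        }
        return table;
    }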

Patch

diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 5633b85..6dd6387 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -14,11 +14,14 @@  typedef struct node_info {
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
     bool present;
+    uint64_t mem_start;
+    uint64_t mem_end;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void parse_numa_opts(void);
 void numa_post_machine_init(void);
 void query_numa_node_mem(uint64_t node_mem[]);
 extern QemuOptsList qemu_numa_opts;
+uint32_t get_numa_node(uint64_t addr, Error **errp);
 
 #endif
diff --git a/numa.c b/numa.c
index 5634bf0..d0eb647 100644
--- a/numa.c
+++ b/numa.c
@@ -53,6 +53,25 @@  static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
+/*
+ * Given an address, return the index of the NUMA node to which the
+ * address belongs.
+ */
+uint32_t get_numa_node(uint64_t addr, Error **errp)
+{
+    uint32_t i;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (addr >= numa_info[i].mem_start && addr < numa_info[i].mem_end) {
+            return i;
+        }
+    }
+    error_setg(errp, "Address 0x%" PRIx64 " doesn't belong to any NUMA node", addr);
+
+    /* Return Node 0 for unclaimed address */
+    return 0;
+}
+
 static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 {
     uint16_t nodenr;
@@ -119,6 +138,15 @@  static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+
+    if (nodenr) {
+        numa_info[nodenr].mem_start = numa_info[nodenr - 1].mem_end;
+    } else {
+        numa_info[nodenr].mem_start = 0;
+    }
+    numa_info[nodenr].mem_end = numa_info[nodenr].mem_start +
+                                numa_info[nodenr].node_mem;
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 }