[RFC,v0] numa: API to lookup NUMA node by address

Message ID 1430982264-25497-1-git-send-email-bharata@linux.vnet.ibm.com
State New

Commit Message

Bharata B Rao May 7, 2015, 7:04 a.m. UTC
Keep track of the start and end address of each NUMA node in the
numa_info structure so that looking up a node by address becomes easier.
Add an API, numa_get_node(), to look up a node by address.

This is needed by sPAPR PowerPC to support the
ibm,dynamic-reconfiguration-memory device tree node, which is required
for memory hotplug.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
This patch was earlier posted as part of sPAPR hotplug patchset here:
https://lists.gnu.org/archive/html/qemu-ppc/2015-04/msg00204.html

 include/sysemu/numa.h |  3 +++
 numa.c                | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
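
A caller (e.g. the sPAPR device tree code) would use the API roughly
like this (illustrative sketch; the node-0 fallback is an assumption,
not part of this patch):

    Error *err = NULL;
    uint32_t node = numa_get_node(addr, &err);

    if (err) {
        /* addr is not covered by any NUMA node; fall back (assumption) */
        error_free(err);
        node = 0;
    }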

Comments

Bharata B Rao May 13, 2015, 3:33 p.m. UTC | #1
CC'ing Eduardo...

Does this approach look sane? Currently only PowerPC needs this, but is
this API correct from other architectures' perspective?

On Thu, May 07, 2015 at 12:34:24PM +0530, Bharata B Rao wrote:
> Keep track of the start and end address of each NUMA node in the
> numa_info structure so that looking up a node by address becomes easier.
> Add an API, numa_get_node(), to look up a node by address.
> 
> This is needed by sPAPR PowerPC to support the
> ibm,dynamic-reconfiguration-memory device tree node, which is required
> for memory hotplug.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> This patch was earlier posted as part of sPAPR hotplug patchset here:
> https://lists.gnu.org/archive/html/qemu-ppc/2015-04/msg00204.html
> 
>  include/sysemu/numa.h |  3 +++
>  numa.c                | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 64 insertions(+)
> 
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index 6523b4d..19c0ba3 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -15,11 +15,14 @@ typedef struct node_info {
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
>      bool present;
> +    ram_addr_t mem_start;
> +    ram_addr_t mem_end;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void parse_numa_opts(MachineClass *mc);
>  void numa_post_machine_init(void);
>  void query_numa_node_mem(uint64_t node_mem[]);
>  extern QemuOptsList qemu_numa_opts;
> +uint32_t numa_get_node(ram_addr_t addr, Error **errp);
> 
>  #endif
> diff --git a/numa.c b/numa.c
> index c975fb2..fdf333b 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -53,6 +53,63 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
>  int nb_numa_nodes;
>  NodeInfo numa_info[MAX_NODES];
> 
> +/*
> + * Given an address, return the index of the NUMA node to which the
> + * address belongs.
> + */
> +uint32_t numa_get_node(ram_addr_t addr, Error **errp)
> +{
> +    uint32_t i;
> +    MemoryDeviceInfoList *info_list = NULL;
> +    MemoryDeviceInfoList **prev = &info_list;
> +    MemoryDeviceInfoList *info;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (addr >= numa_info[i].mem_start && addr < numa_info[i].mem_end) {
> +            return i;
> +        }
> +    }
> +
> +    /*
> +     * If this @addr falls inside a cold- or hot-plugged memory region,
> +     * check there too.
> +     */
> +    qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
> +    for (info = info_list; info; info = info->next) {
> +        MemoryDeviceInfo *value = info->value;
> +
> +        if (value) {
> +            switch (value->kind) {
> +            case MEMORY_DEVICE_INFO_KIND_DIMM:
> +                if (addr >= value->dimm->addr &&
> +                        addr < (value->dimm->addr + value->dimm->size)) {
> +                    qapi_free_MemoryDeviceInfoList(info_list);
> +                    return value->dimm->node;
> +                }
> +                break;
> +            default:
> +                break;
> +            }
> +        }
> +    }
> +    qapi_free_MemoryDeviceInfoList(info_list);
> +    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
> +                "NUMA node", addr);
> +
> +    return -1;
> +}
> +
> +static void numa_set_mem_address(int nodenr)
> +{
> +    if (nodenr) {
> +        numa_info[nodenr].mem_start = numa_info[nodenr-1].mem_end;
> +    } else {
> +        numa_info[nodenr].mem_start = 0;
> +    }
> +    numa_info[nodenr].mem_end = numa_info[nodenr].mem_start +
> +                                   numa_info[nodenr].node_mem;
> +}
> +
>  static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>  {
>      uint16_t nodenr;
> @@ -276,6 +333,10 @@ void parse_numa_opts(MachineClass *mc)
>          }
> 
>          for (i = 0; i < nb_numa_nodes; i++) {
> +            numa_set_mem_address(i);
> +        }
> +
> +        for (i = 0; i < nb_numa_nodes; i++) {
>              if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
> -- 
> 2.1.0
Eduardo Habkost May 13, 2015, 6:06 p.m. UTC | #2
On Thu, May 07, 2015 at 12:34:24PM +0530, Bharata B Rao wrote:
> Keep track of the start and end address of each NUMA node in the
> numa_info structure so that looking up a node by address becomes easier.
> Add an API, numa_get_node(), to look up a node by address.
> 
> This is needed by sPAPR PowerPC to support the
> ibm,dynamic-reconfiguration-memory device tree node, which is required
> for memory hotplug.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> This patch was earlier posted as part of sPAPR hotplug patchset here:
> https://lists.gnu.org/archive/html/qemu-ppc/2015-04/msg00204.html
> 
>  include/sysemu/numa.h |  3 +++
>  numa.c                | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 64 insertions(+)
> 
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index 6523b4d..19c0ba3 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -15,11 +15,14 @@ typedef struct node_info {
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
>      bool present;
> +    ram_addr_t mem_start;
> +    ram_addr_t mem_end;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void parse_numa_opts(MachineClass *mc);
>  void numa_post_machine_init(void);
>  void query_numa_node_mem(uint64_t node_mem[]);
>  extern QemuOptsList qemu_numa_opts;
> +uint32_t numa_get_node(ram_addr_t addr, Error **errp);
>  
>  #endif
> diff --git a/numa.c b/numa.c
> index c975fb2..fdf333b 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -53,6 +53,63 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
>  int nb_numa_nodes;
>  NodeInfo numa_info[MAX_NODES];
>  
> +/*
> + * Given an address, return the index of the NUMA node to which the
> + * address belongs.
> + */
> +uint32_t numa_get_node(ram_addr_t addr, Error **errp)
> +{
> +    uint32_t i;
> +    MemoryDeviceInfoList *info_list = NULL;
> +    MemoryDeviceInfoList **prev = &info_list;
> +    MemoryDeviceInfoList *info;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (addr >= numa_info[i].mem_start && addr < numa_info[i].mem_end) {
> +            return i;
> +        }
> +    }
> +
> +    /*
> +     * If this @addr falls inside a cold- or hot-plugged memory region,
> +     * check there too.
> +     */
> +    qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
> +    for (info = info_list; info; info = info->next) {
> +        MemoryDeviceInfo *value = info->value;
> +
> +        if (value) {
> +            switch (value->kind) {
> +            case MEMORY_DEVICE_INFO_KIND_DIMM:
> +                if (addr >= value->dimm->addr &&
> +                        addr < (value->dimm->addr + value->dimm->size)) {
> +                    qapi_free_MemoryDeviceInfoList(info_list);
> +                    return value->dimm->node;
> +                }
> +                break;
> +            default:
> +                break;
> +            }
> +        }
> +    }

I am bothered that we need to use two different methods to look up the
NUMA node, and that we scan the whole list of /machine children every
time numa_get_node() is called.

Also, this introduces a circular dependency between pc-dimm.c and
numa.c. Instead of that, pc-dimm could simply notify us when a new
device is realized (with just (addr, end, node) as arguments), so we can
save the list of memory ranges inside struct node_info.
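
Roughly something like this sketch (numa_add_mem_range() and the
mem_ranges list are made-up names):

    typedef struct NumaMemRange {
        ram_addr_t start;
        ram_addr_t end;
        QLIST_ENTRY(NumaMemRange) entry;
    } NumaMemRange;

    /* NodeInfo would grow a QLIST_HEAD(, NumaMemRange) mem_ranges; */

    /* called by the plug code once the range's address is known */
    void numa_add_mem_range(ram_addr_t start, ram_addr_t end, uint32_t node)
    {
        NumaMemRange *r = g_new0(NumaMemRange, 1);

        r->start = start;
        r->end = end;
        QLIST_INSERT_HEAD(&numa_info[node].mem_ranges, r, entry);
    }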

I wonder if the memory API already provides something that would help
us. Paolo, do you see a way we could simply use a MemoryRegion as input
to lookup the NUMA node?


> +    qapi_free_MemoryDeviceInfoList(info_list);
> +    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
> +                "NUMA node", addr);
> +
> +    return -1;
> +}
> +
> +static void numa_set_mem_address(int nodenr)
> +{
> +    if (nodenr) {
> +        numa_info[nodenr].mem_start = numa_info[nodenr-1].mem_end;

You isolated the code inside a function, but it requires the function to
be called in a specific nodenr order. I would just make it a loop that
calculates mem_start and mem_end for all nodes, then you won't need a
special case for node 0.
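
Something like this (sketch):

    static void numa_set_mem_ranges(void)
    {
        int i;
        ram_addr_t mem_start = 0;

        /* deduce mem_start/mem_end for all nodes in one pass */
        for (i = 0; i < nb_numa_nodes; i++) {
            numa_info[i].mem_start = mem_start;
            numa_info[i].mem_end = mem_start + numa_info[i].node_mem;
            mem_start = numa_info[i].mem_end;
        }
    }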

> +    } else {
> +        numa_info[nodenr].mem_start = 0;
> +    }
> +    numa_info[nodenr].mem_end = numa_info[nodenr].mem_start +
> +                                   numa_info[nodenr].node_mem;

Now that we have specific fields for the memory ranges, it would be
interesting to reuse mem_start and mem_end inside
memory_region_allocate_system_memory() instead of duplicating the
address calculation there.
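
i.e. the subregion mapping in that loop could become (rough, untested
sketch):

    memory_region_add_subregion(mr, numa_info[i].mem_start, seg);

instead of keeping a running addr local to the function.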

> +}
> +
>  static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>  {
>      uint16_t nodenr;
> @@ -276,6 +333,10 @@ void parse_numa_opts(MachineClass *mc)
>          }
>  
>          for (i = 0; i < nb_numa_nodes; i++) {
> +            numa_set_mem_address(i);
> +        }
> +
> +        for (i = 0; i < nb_numa_nodes; i++) {
>              if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
> -- 
> 2.1.0
> 
>
Paolo Bonzini May 14, 2015, 9:39 a.m. UTC | #3
On 13/05/2015 20:06, Eduardo Habkost wrote:
> Also, this introduces a circular dependency between pc-dimm.c and
> numa.c. Instead of that, pc-dimm could simply notify us when a new
> device is realized (with just (addr, end, node) as arguments), so we can
> save the list of memory ranges inside struct node_info.
> 
> I wonder if the memory API already provides something that would help
> us. Paolo, do you see a way we could simply use a MemoryRegion as input
> to lookup the NUMA node?

No, but I guess you could add a numa_get/set_memory_region_node_id API
that uses a hash table.  That's a variant of the "pc-dimm could simply
notify" numa.c that you propose above.

Paolo

> 
>> +    qapi_free_MemoryDeviceInfoList(info_list);
>> +    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
>> +                "NUMA node", addr);
>> +
>> +    return -1;
>> +}
>> +
>> +static void numa_set_mem_address(int nodenr)
>> +{
>> +    if (nodenr) {
>> +        numa_info[nodenr].mem_start = numa_info[nodenr-1].mem_end;
> 
> You isolated the code inside a function, but it requires the function to
> be called in a specific nodenr order. I would just make it a loop that
> calculates mem_start and mem_end for all nodes, then you won't need a
> special case for node 0.
> 
>> +    } else {
>> +        numa_info[nodenr].mem_start = 0;
>> +    }
>> +    numa_info[nodenr].mem_end = numa_info[nodenr].mem_start +
>> +                                   numa_info[nodenr].node_mem;
> 
> Now that we have specific fields for the memory ranges, it would be
> interesting to reuse mem_start and mem_end inside
> memory_region_allocate_system_memory() instead of duplicating the
> address calculation there.
> 
>> +}
>> +
>>  static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>>  {
>>      uint16_t nodenr;
>> @@ -276,6 +333,10 @@ void parse_numa_opts(MachineClass *mc)
>>          }
>>  
>>          for (i = 0; i < nb_numa_nodes; i++) {
>> +            numa_set_mem_address(i);
>> +        }
>> +
>> +        for (i = 0; i < nb_numa_nodes; i++) {
>>              if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>>                  break;
>>              }
>> -- 
>> 2.1.0
>>
>>
>
Bharata B Rao May 25, 2015, 7:47 a.m. UTC | #4
On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/05/2015 20:06, Eduardo Habkost wrote:
> > Also, this introduces a circular dependency between pc-dimm.c and
> > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > device is realized (with just (addr, end, node) as arguments), so we can
> > save the list of memory ranges inside struct node_info.
> > 
> > I wonder if the memory API already provides something that would help
> > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > to lookup the NUMA node?
> 
> No, but I guess you could add a numa_get/set_memory_region_node_id API
> that uses a hash table.  That's a variant of the "pc-dimm could simply
> notify" numa.c that you propose above.

While you say we can't use MemoryRegion as input to look up the NUMA node,
you suggest that we add numa_get/set_memory_region_node_id. Does this API
get/set the NUMA node id for the given MemoryRegion?

Regards,
Bharata.
Eduardo Habkost May 25, 2015, 5:42 p.m. UTC | #5
On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > Also, this introduces a circular dependency between pc-dimm.c and
> > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > device is realized (with just (addr, end, node) as arguments), so we can
> > > save the list of memory ranges inside struct node_info.
> > > 
> > > I wonder if the memory API already provides something that would help
> > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > to lookup the NUMA node?
> > 
> > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > notify" numa.c that you propose above.
> 
> While you say we can't use MemoryRegion as input to lookup the NUMA node,
> you suggest that we add numa_get/set_memory_region_node_id. Does this API
> get/set NUMA node id for the given MemoryRegion ? 

I was going to suggest that, but it would require changing the
non-memdev code path to create a MemoryRegion for each node, too. So
having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
be simpler.
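
With ranges recorded per node (e.g. the NumaMemRange list sketched
earlier in this thread), numa_get_node() would not need the pc-dimm scan
at all; rough sketch:

    uint32_t numa_get_node(ram_addr_t addr, Error **errp)
    {
        uint32_t i;
        NumaMemRange *r;

        for (i = 0; i < nb_numa_nodes; i++) {
            QLIST_FOREACH(r, &numa_info[i].mem_ranges, entry) {
                if (addr >= r->start && addr < r->end) {
                    return i;
                }
            }
        }

        error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
                   "NUMA node", addr);
        return -1;
    }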
Bharata B Rao June 8, 2015, 5:58 a.m. UTC | #6
On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > save the list of memory ranges inside struct node_info.
> > > > 
> > > > I wonder if the memory API already provides something that would help
> > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > to lookup the NUMA node?
> > > 
> > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > notify" numa.c that you propose above.
> > 
> > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > get/set NUMA node id for the given MemoryRegion ? 
> 
> I was going to suggest that, but it would require changing the
> non-memdev code path to create a MemoryRegion for each node, too. So
> having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> be simpler.

In order to save the list of memory ranges inside node_info, I tried this
approach where I call numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node)
from pc_dimm_realize(), but the value of dimm.addr is finalized only later
in ->plug().

So we would have to call this API from arch code like pc_dimm_plug().
Is that acceptable?
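
The call I have in mind would look roughly like this (sketch; the
property macros are the existing pc-dimm ones, everything else is
approximate):

    /* in e.g. pc_dimm_plug(), after the address has been assigned */
    uint64_t addr = object_property_get_int(OBJECT(dev), PC_DIMM_ADDR_PROP,
                                            &error_abort);
    uint64_t size = object_property_get_int(OBJECT(dev), PC_DIMM_SIZE_PROP,
                                            &error_abort);
    uint32_t node = object_property_get_int(OBJECT(dev), PC_DIMM_NODE_PROP,
                                            &error_abort);

    numa_set_mem_node_id(addr, size, node);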

Regards,
Bharata.
Igor Mammedov June 8, 2015, 9:51 a.m. UTC | #7
On Mon, 8 Jun 2015 11:28:18 +0530
Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:

> On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > save the list of memory ranges inside struct node_info.
> > > > > 
> > > > > I wonder if the memory API already provides something that would help
> > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > to lookup the NUMA node?
> > > > 
> > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > notify" numa.c that you propose above.
> > > 
> > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > get/set NUMA node id for the given MemoryRegion ? 
> > 
> > I was going to suggest that, but it would require changing the
> > non-memdev code path to create a MemoryRegion for each node, too. So
> > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > be simpler.
> 
> In order to save the list of memory ranges inside node_info, I tried this
> approach where I call
> 
> numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> 
> pc_dimm_realize(), but
> 
> the value of dimm.addr is finalized only later in ->plug().
> 
> So we would have to call this API from arch code like pc_dimm_plug().
> Is that acceptable ?
Could you query the pc_dimms' numa property each time you need the mapping
instead of additionally storing that mapping elsewhere?

> 
> Regards,
> Bharata.
> 
>
Eduardo Habkost June 8, 2015, 3:51 p.m. UTC | #8
On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> On Mon, 8 Jun 2015 11:28:18 +0530
> Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> 
> > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > save the list of memory ranges inside struct node_info.
> > > > > > 
> > > > > > I wonder if the memory API already provides something that would help
> > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > to lookup the NUMA node?
> > > > > 
> > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > notify" numa.c that you propose above.
> > > > 
> > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > get/set NUMA node id for the given MemoryRegion ? 
> > > 
> > > I was going to suggest that, but it would require changing the
> > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > be simpler.
> > 
> > In order to save the list of memory ranges inside node_info, I tried this
> > approach where I call
> > 
> > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > 
> > pc_dimm_realize(), but
> > 
> > the value of dimm.addr is finalized only later in ->plug().
> > 
> > So we would have to call this API from arch code like pc_dimm_plug().
> > Is that acceptable ?

It looks acceptable to me, as pc.c already has all the rest of the
NUMA-specific code for PC. I believe it would be interesting to keep all
numa.o dependencies contained inside machine code.

> Could you query pc_dimms' numa property each time you need mapping
> instead of additionally storing that mapping elsewhere?

The original patch did that, but I suggested the
numa_set_mem_node_id() API for two reasons: 1) not requiring special
cases for hotplug inside numa_get_node(); 2) not introducing a circular
dependency between pc-dimm.c and numa.c.

Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
would probably be better, and make the discussion about pc_dimm.addr
moot. But it would require changing
memory_region_allocate_system_memory() to avoid
allocate_system_memory_nonnuma() even in the !have_memdevs case.
Paolo Bonzini June 8, 2015, 3:55 p.m. UTC | #9
On 08/06/2015 17:51, Eduardo Habkost wrote:
> 
> Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> would probably be better, and make the discussion about pc_dimm.addr
> moot. But it would require changing
> memory_region_allocate_system_memory() to avoid
> allocate_system_memory_nonnuma() even in the !have_memdevs case.

This in turn may have complications due to migration.  We can do it later.

Paolo
Eduardo Habkost June 8, 2015, 4:09 p.m. UTC | #10
On Mon, Jun 08, 2015 at 05:55:10PM +0200, Paolo Bonzini wrote:
> On 08/06/2015 17:51, Eduardo Habkost wrote:
> > 
> > Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> > would probably be better, and make the discussion about pc_dimm.addr
> > moot. But it would require changing
> > memory_region_allocate_system_memory() to avoid
> > allocate_system_memory_nonnuma() even in the !have_memdevs case.
> 
> This in turn may have complications due to migration.  We can do it later.

I agree we can do it later. But what kind of complications do you see?
Is the choice of MemoryRegion layout really supposed to affect
migration?

(Is it just about the single vmstate_register_ram_global() call at
allocate_system_memory_nonnuma()?)
Igor Mammedov June 9, 2015, 9:23 a.m. UTC | #11
On Mon, 8 Jun 2015 12:51:39 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > On Mon, 8 Jun 2015 11:28:18 +0530
> > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > 
> > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > 
> > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > to lookup the NUMA node?
> > > > > > 
> > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > notify" numa.c that you propose above.
> > > > > 
> > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > 
> > > > I was going to suggest that, but it would require changing the
> > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > be simpler.
> > > 
> > > In order to save the list of memory ranges inside node_info, I tried this
> > > approach where I call
> > > 
> > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > 
> > > pc_dimm_realize(), but
> > > 
> > > the value of dimm.addr is finalized only later in ->plug().
> > > 
> > > So we would have to call this API from arch code like pc_dimm_plug().
> > > Is that acceptable ?
> 
> It looks acceptable to me, as pc.c already has all the rest of the
> NUMA-specific code for PC. I believe it would be interesting to keep all
> numa.o dependencies contained inside machine code.
> 
> > Could you query pc_dimms' numa property each time you need mapping
> > instead of additionally storing that mapping elsewhere?
> 
> The original patch did that, but I suggested the
> numa_set_mem_node_id() API for two reasons: 1) not requiring special
> cases for hotplug inside numa_get_node(); 2) not introducing a circular
> dependency between pc-dimm.c and numa.c.
What circular dependency would doing foreach(pc-dimm) introduce?
So far pc-dimm is independent from numa.c: it is a regular device with no
dependencies (except on its backend memdev) and has its own 'numa' property
for providing that information to interested users. I'd rather keep
it separate from legacy numa.c:-numa handling.

The only use I see for -numa with -device pc-dimm is to define a possible
fixed memory layout (pc-dimm <-> node mapping) at startup for platforms
that have to describe it at boot time. And even for that, I'd suggest
using something like -numa nodeid=X,addr=Z,size=Y,dimm=ID and having the
board code that does the mapping call numa.c:numa_get_layout_for_device(dimm_id)
and set the received values on pc-dimm properties if the plugged-in dimm fits them.

What for one needs to pull dimm properties into numa.c?
 
> Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> would probably be better, and make the discussion about pc_dimm.addr
Wouldn't it be a layering violation if we put (frontend) nodeid information
into (backend) MemoryRegion?

> moot. But it would require changing
> memory_region_allocate_system_memory() to avoid
> allocate_system_memory_nonnuma() even in the !have_memdevs case.
>
Eduardo Habkost June 9, 2015, 12:40 p.m. UTC | #12
On Tue, Jun 09, 2015 at 11:23:19AM +0200, Igor Mammedov wrote:
> On Mon, 8 Jun 2015 12:51:39 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > > On Mon, 8 Jun 2015 11:28:18 +0530
> > > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > > 
> > > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > > 
> > > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > > to lookup the NUMA node?
> > > > > > > 
> > > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > > notify" numa.c that you propose above.
> > > > > > 
> > > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > > 
> > > > > I was going to suggest that, but it would require changing the
> > > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > > be simpler.
> > > > 
> > > > In order to save the list of memory ranges inside node_info, I tried this
> > > > approach where I call
> > > > 
> > > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > > 
> > > > pc_dimm_realize(), but
> > > > 
> > > > the value of dimm.addr is finalized only later in ->plug().
> > > > 
> > > > So we would have to call this API from arch code like pc_dimm_plug().
> > > > Is that acceptable ?
> > 
> > It looks acceptable to me, as pc.c already has all the rest of the
> > NUMA-specific code for PC. I believe it would be interesting to keep all
> > numa.o dependencies contained inside machine code.
> > 
> > > Could you query pc_dimms' numa property each time you need mapping
> > > instead of additionally storing that mapping elsewhere?
> > 
> > The original patch did that, but I suggested the
> > numa_set_mem_node_id() API for two reasons: 1) not requiring special
> > cases for hotplug inside numa_get_node(); 2) not introducing a circular
> > dependency between pc-dimm.c and numa.c.
> What circular dependency doing foreach(pc-dimm) would introduce?
> So far pc-dimm is independent from numa.c and a regular device with no
> dependencies (except of on backend memdev) and has it's own 'numa' property
> for providing that information to interested users. I'd rather keep
> it separate form legacy numa.c:-numa handling.

pc-dimm.c already depends on numa.c because it checks nb_numa_nodes
inside pc_dimm_realize().
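
That check looks roughly like this (quoted approximately, not verbatim):

    /* approximate: the node-range check in pc_dimm_realize() */
    if (((nb_numa_nodes > 0) && (dimm->node >= nb_numa_nodes)) ||
        (!nb_numa_nodes && dimm->node)) {
        error_setg(errp, "'DIMM property " PC_DIMM_NODE_PROP " has value %"
                   PRIu32 "' which exceeds the number of numa nodes: %d",
                   dimm->node, nb_numa_nodes ? nb_numa_nodes : 1);
        return;
    }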

I don't understand what you mean by "legacy numa.c:-numa handling".
Unless there's a way to query pc-dimm (or other code) for all
(address -> numa_node) mappings without a special case for memory
hotplug[1], I wouldn't call node_info[].node_mem "legacy".

[1] And this is exactly what I want to provide with
    numa_set_mem_node_id(): an API that doesn't require special cases
    for memory hotplug.

> 
> The only thing I see for using -numa with -device pc-dimm is to define
> possible fixed memory layout (pc-dimm <-> node mapping) at startup for
> platforms that have to describe it at boot time. And even for that, I'd
> suggest to use something like -numa nodeid=X,addr=Z,size=Y,dimm=ID and have
> board code that does mapping to call numa.c:numa_get_layout_for_device(dimm_id)
> and set received values on pc-dimm properties if plugged in dimm fits them.

I don't understand why you are considering the above. There's no
proposal to change command-line arguments at this point.

> 
> What for one needs to pull dimm properties into numa.c?

I am not sure I parsed the question correctly, but:

Original commit message explains why numa_get_node(addr) is needed:

> > This is needed by sPAPR PowerPC to support the
> > ibm,dynamic-reconfiguration-memory device tree node, which is required
> > for memory hotplug.

And to make this work, it needs to be aware of NUMA information for
hotplugged memory too.

My proposal is to do that inside the code that actually assigns
addresses to pc-dimm: pc_dimm_realize() (pc.c). So pc-dimm.c and numa.c
won't reference each other, and numa.c won't need any special code for
hotplug.

We could have common helpers later to avoid duplicating the same logic
into other machines, but the point is that we don't need to make numa.c
carry special memory hotplug code, and we don't need to make pc-dimm.c
care about -numa.

>  
> > Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> > would probably be better, and make the discussion about pc_dimm.addr
> Wouldn't it be layering violation if we put (frontend) nodeid information
> into (backend) MemoryRegion?

We wouldn't; it would be a numa.c function that would just keep track of
the list of MemoryRegions for each NUMA node.

> 
> > moot. But it would require changing
> > memory_region_allocate_system_memory() to avoid
> > allocate_system_memory_nonnuma() even in the !have_memdevs case.
> > 
>
Igor Mammedov June 10, 2015, 9:43 a.m. UTC | #13
On Tue, 9 Jun 2015 09:40:54 -0300
Eduardo Habkost <ehabkost@redhat.com> wrote:

> On Tue, Jun 09, 2015 at 11:23:19AM +0200, Igor Mammedov wrote:
> > On Mon, 8 Jun 2015 12:51:39 -0300
> > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > 
> > > On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > > > On Mon, 8 Jun 2015 11:28:18 +0530
> > > > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > > > 
> > > > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > > > 
> > > > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > > > to lookup the NUMA node?
> > > > > > > > 
> > > > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > > > notify" numa.c that you propose above.
> > > > > > > 
> > > > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > > > 
> > > > > > I was going to suggest that, but it would require changing the
> > > > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > > > be simpler.
> > > > > 
> > > > > In order to save the list of memory ranges inside node_info, I tried this
> > > > > approach where I call
> > > > > 
> > > > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > > > 
> > > > > pc_dimm_realize(), but
> > > > > 
> > > > > the value of dimm.addr is finalized only later in ->plug().
> > > > > 
> > > > > So we would have to call this API from arch code like pc_dimm_plug().
> > > > > Is that acceptable ?
> > > 
> > > It looks acceptable to me, as pc.c already has all the rest of the
> > > NUMA-specific code for PC. I believe it would be interesting to keep all
> > > numa.o dependencies contained inside machine code.
> > > 
> > > > Could you query pc_dimms' numa property each time you need mapping
> > > > instead of additionally storing that mapping elsewhere?
> > > 
> > > The original patch did that, but I suggested the
> > > numa_set_mem_node_id() API for two reasons: 1) not requiring special
> > > cases for hotplug inside numa_get_node(); 2) not introducing a circular
> > > dependency between pc-dimm.c and numa.c.
> > What circular dependency doing foreach(pc-dimm) would introduce?
> > So far pc-dimm is independent from numa.c and a regular device with no
> > dependencies (except of on backend memdev) and has it's own 'numa' property
> > for providing that information to interested users. I'd rather keep
> > it separate form legacy numa.c:-numa handling.
> 
> pc-dimm.c already depends on numa.c because it checks nb_numa_nodes
> inside pc_dimm_realize().
The check should be in pc_dimm_plug() instead of realize, but I guess it
saves duplication when pc-dimm is reused with other targets; anyway, we
could move it out into a generic common function.

> 
> I don't understand what you mean by "legacy numa.c:-numa handling".
> Unless there's a way to query pc-dimm (or other code) for all
> (address -> numa_node) mappings without a special case for memory
> hotplug[1], I wouldn't call node_info[].node_mem "legacy".
> 
> [1] And this is exactly what I want to provide with
>     numa_set_mem_node_id(): an API that doesn't require special cases
>     for memory hotplug.
For x86, board makers usually define the address range -> node mapping
statically. So node_info[].node_mem & numa_set_mem_node_id() make sense.
And sPAPR could probably do the same; no need to scan for pc-dimm devices.

 
[...]
> > What for one needs to pull dimm properties into numa.c?
> 
> I am not sure I parsed the question correctly, but:
> 
> Original commit message explains why numa_get_node(addr) is needed:
> 
> > > This is needed by sPAPR PowerPC to support the
> > > ibm,dynamic-reconfiguration-memory device tree node, which is required
> > > for memory hotplug.
> 
> And to make this work, it needs to be aware of NUMA information for
> hotplugged memory too.
I've checked spapr_populate_drconf_memory() from the original series;
it needs to be aware at startup of the address range -> node mapping,
including the partitioning of the whole hotplug memory range
(i.e. not just the actually hotplugged memory).
-numa node_mem & numa_set_mem_node_id() are sufficient for this purpose.

> My proposal is to do that inside the code that actually assigns
> addresses to pc-dimm: pc_dimm_realize() (pc.c). So pc-dimm.c and numa.c
> won't reference each other, and numa.c won't need any special code for
> hotplug.
You've probably meant pc_dimm_plug() instead of pc_dimm_realize(),
but it shouldn't call numa_set_mem_node_id() since partitioning is static
and is done at startup time.

> We could have common helpers later to avoid duplicating the same logic
> into other machines, but the point is that we don't need to make numa.c
> carry special memory hotplug code, and we don't need to make pc-dimm.c
> care about -numa.
agreed.

We don't have static partitioning of the hotplug memory address space in
the x86 target because it limits the flexibility of hot-plugging any amount
of memory to any node. (But we could do both: a static partitioning using
the SRAT table, overridden by the _PXM method for dynamic assignment.)

The issue with static partitioning is that mgmt tools have to know the
addr & size of the hotplug memory address space in order to partition it,
but QEMU doesn't provide that info so far.

> 
> >  
> > > Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> > > would probably be better, and make the discussion about pc_dimm.addr
> > Wouldn't it be layering violation if we put (frontend) nodeid information
> > into (backend) MemoryRegion?
> 
> We wouldn't, it would be a numa.c function that would just keep track of
> the list of MemoryRegions for each NUMA node.
I still don't get how the idea to use MemoryRegions applies to the above:
you can have MemoryRegions for present memory, but there isn't any
for memory that hasn't been plugged in yet.
numa_set_mem_node_id() looks like the way to go, and a simple one at that.

> 
> > 
> > > moot. But it would require changing
> > > memory_region_allocate_system_memory() to avoid
> > > allocate_system_memory_nonnuma() even in the !have_memdevs case.
> > > 
> > 
>
Eduardo Habkost June 10, 2015, 12:14 p.m. UTC | #14
On Wed, Jun 10, 2015 at 11:43:19AM +0200, Igor Mammedov wrote:
> On Tue, 9 Jun 2015 09:40:54 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Tue, Jun 09, 2015 at 11:23:19AM +0200, Igor Mammedov wrote:
> > > On Mon, 8 Jun 2015 12:51:39 -0300
> > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > 
> > > > On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > > > > On Mon, 8 Jun 2015 11:28:18 +0530
> > > > > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > > > > 
> > > > > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > > > > 
> > > > > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > > > > to lookup the NUMA node?
> > > > > > > > > 
> > > > > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > > > > notify" numa.c that you propose above.
> > > > > > > > 
> > > > > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > > > > 
> > > > > > > I was going to suggest that, but it would require changing the
> > > > > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > > > > be simpler.
> > > > > > 
> > > > > > In order to save the list of memory ranges inside node_info, I tried this
> > > > > > approach where I call
> > > > > > 
> > > > > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > > > > 
> > > > > > pc_dimm_realize(), but
> > > > > > 
> > > > > > the value of dimm.addr is finalized only later in ->plug().
> > > > > > 
> > > > > > So we would have to call this API from arch code like pc_dimm_plug().
> > > > > > Is that acceptable ?
> > > > 
> > > > It looks acceptable to me, as pc.c already has all the rest of the
> > > > NUMA-specific code for PC. I believe it would be interesting to keep all
> > > > numa.o dependencies contained inside machine code.
> > > > 
> > > > > Could you query pc_dimms' numa property each time you need mapping
> > > > > instead of additionally storing that mapping elsewhere?
> > > > 
> > > > The original patch did that, but I suggested the
> > > > numa_set_mem_node_id() API for two reasons: 1) not requiring special
> > > > cases for hotplug inside numa_get_node(); 2) not introducing a circular
> > > > dependency between pc-dimm.c and numa.c.
> > > What circular dependency doing foreach(pc-dimm) would introduce?
> > > So far pc-dimm is independent from numa.c and a regular device with no
> > > dependencies (except of on backend memdev) and has it's own 'numa' property
> > > for providing that information to interested users. I'd rather keep
> > > it separate form legacy numa.c:-numa handling.
> > 
> > pc-dimm.c already depends on numa.c because it checks nb_numa_nodes
> > inside pc_dimm_realize().
> check should be in pc_dimm_plug() instead of realize but I guess it saves
> up duplication when pc-dimm reused with other targets, anyway we could move
> it out into generic common function.

Agreed.

> 
> > 
> > I don't understand what you mean by "legacy numa.c:-numa handling".
> > Unless there's a way to query pc-dimm (or other code) for all
> > (address -> numa_node) mappings without a special case for memory
> > hotplug[1], I wouldn't call node_info[].node_mem "legacy".
> > 
> > [1] And this is exactly what I want to provide with
> >     numa_set_mem_node_id(): an API that doesn't require special cases
> >     for memory hotplug.
> For x86 board makers usually define address ranges -> node mapping statically.
> So node_info[].node_mem  & numa_set_mem_node_id() makes sense.
> And sPAPR could probably do the same, no need to scan for pc-dimm devices.

OK, so we are on the same page now.

> 
>  
> [...]
> > > What for one needs to pull dimm properties into numa.c?
> > 
> > I am not sure I parsed the question correctly, but:
> > 
> > Original commit message explains why numa_get_node(addr) is needed:
> > 
> > > > This is needed by sPAPR PowerPC to support the
> > > > ibm,dynamic-reconfiguration-memory device tree node, which is required
> > > > for memory hotplug.
> > 
> > And to make this work, it needs to be aware of NUMA information for
> > hotplugged memory too.
> I've checked spapr_populate_drconf_memory() from original series,
> it needs to be aware at startup about address ranges -> node mapping
> including mapping partitioning of whole hotplug memory range
> (i.e. not actual hotplugged memory).
> -numa node_mem  & numa_set_mem_node_id() are sufficient for this purpose 

Good. :)

> 
> > My proposal is to do that inside the code that actually assigns
> > addresses to pc-dimm: pc_dimm_realize() (pc.c). So pc-dimm.c and numa.c
> > won't reference each other, and numa.c won't need any special code for
> > hotplug.
> You've probably meant pc_dimm_plug() instead of pc_dimm_realize(),
> but it shouldn't call numa_set_mem_node_id() since partitioning is static
> and is done at startup time. 

Sorry, I meant pc_dimm_plug().

I didn't know partitioning was static. I thought it was dynamic and
defined by the "node" property on pc-dimm. (But I am confused by what
you say below).


> 
> > We could have common helpers later to avoid duplicating the same logic
> > into other machines, but the point is that we don't need to make numa.c
> > carry special memory hotplug code, and we don't need to make pc-dimm.c
> > care about -numa.
> agreed.
> 
> We don't have static partitioning of hotplug memory address space in
> x86 target because it limits flexibility of hot-plugging any amount
> of memory to any node. (but we could both a static partitioning using
> SRAT table and override it _PXM method for dynamic assignment).
> 
> The issue with static partitioning is that mgmt tools have to know
> addr & size of hotplug memory address space to partition it, but
> QEMU doesn't provide that info so far.

OK, so we agree about the numa_set_mem_node_id() API, but I am confused
by what you said about static partitioning. Didn't you just say above
that partitioning is static and done at startup time?

> 
> > 
> > >  
> > > > Having a numa_set_memory_region_node_id(MemoryRegion *mr, int node) API
> > > > would probably be better, and make the discussion about pc_dimm.addr
> > > Wouldn't it be layering violation if we put (frontend) nodeid information
> > > into (backend) MemoryRegion?
> > 
> > We wouldn't, it would be a numa.c function that would just keep track of
> > the list of MemoryRegions for each NUMA node.
> I sill don't get how idea to use  MemoryRegions applies to above,

I was just considering getting a MemoryRegion pointer as argument
instead of (addr, length); all the rest would be the same. But:

> you can have MemoryRegions for present memory but there isn't any
> for not memory that hasn't been plugged in yet.
> numa_set_mem_node_id() looks like a way to go and a simple one at that.

I believe numa_get_node() will be expected to return info only for
memory that was already plugged. But if some machines make it return
valid info for unplugged memory also, it would be a nice extra feature.
As we don't have separate MemoryRegions for the still-unplugged areas
(yet?), your point seems to be valid.
Bharata B Rao June 10, 2015, 12:50 p.m. UTC | #15
On Wed, Jun 10, 2015 at 11:43:19AM +0200, Igor Mammedov wrote:
> On Tue, 9 Jun 2015 09:40:54 -0300
> Eduardo Habkost <ehabkost@redhat.com> wrote:
> 
> > On Tue, Jun 09, 2015 at 11:23:19AM +0200, Igor Mammedov wrote:
> > > On Mon, 8 Jun 2015 12:51:39 -0300
> > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > 
> > > > On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > > > > On Mon, 8 Jun 2015 11:28:18 +0530
> > > > > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > > > > 
> > > > > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > > > > 
> > > > > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > > > > to lookup the NUMA node?
> > > > > > > > > 
> > > > > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > > > > notify" numa.c that you propose above.
> > > > > > > > 
> > > > > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > > > > 
> > > > > > > I was going to suggest that, but it would require changing the
> > > > > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > > > > be simpler.
> > > > > > 
> > > > > > In order to save the list of memory ranges inside node_info, I tried this
> > > > > > approach where I call
> > > > > > 
> > > > > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > > > > 
> > > > > > pc_dimm_realize(), but
> > > > > > 
> > > > > > the value of dimm.addr is finalized only later in ->plug().
> > > > > > 
> > > > > > So we would have to call this API from arch code like pc_dimm_plug().
> > > > > > Is that acceptable ?
> > > > 
> > > > It looks acceptable to me, as pc.c already has all the rest of the
> > > > NUMA-specific code for PC. I believe it would be interesting to keep all
> > > > numa.o dependencies contained inside machine code.
> > > > 
> > > > > Could you query pc_dimms' numa property each time you need mapping
> > > > > instead of additionally storing that mapping elsewhere?
> > > > 
> > > > The original patch did that, but I suggested the
> > > > numa_set_mem_node_id() API for two reasons: 1) not requiring special
> > > > cases for hotplug inside numa_get_node(); 2) not introducing a circular
> > > > dependency between pc-dimm.c and numa.c.
> > > What circular dependency doing foreach(pc-dimm) would introduce?
> > > So far pc-dimm is independent from numa.c and a regular device with no
> > > dependencies (except of on backend memdev) and has it's own 'numa' property
> > > for providing that information to interested users. I'd rather keep
> > > it separate form legacy numa.c:-numa handling.
> > 
> > pc-dimm.c already depends on numa.c because it checks nb_numa_nodes
> > inside pc_dimm_realize().
> check should be in pc_dimm_plug() instead of realize but I guess it saves
> up duplication when pc-dimm reused with other targets, anyway we could move
> it out into generic common function.
> 
> > 
> > I don't understand what you mean by "legacy numa.c:-numa handling".
> > Unless there's a way to query pc-dimm (or other code) for all
> > (address -> numa_node) mappings without a special case for memory
> > hotplug[1], I wouldn't call node_info[].node_mem "legacy".
> > 
> > [1] And this is exactly what I want to provide with
> >     numa_set_mem_node_id(): an API that doesn't require special cases
> >     for memory hotplug.
> For x86 board makers usually define address ranges -> node mapping statically.
> So node_info[].node_mem  & numa_set_mem_node_id() makes sense.
> And sPAPR could probably do the same, no need to scan for pc-dimm devices.

I thought numa_info[i].node_mem maintains memory size information only
for those node memories that are defined at boot time.

If I have

-m 8G,slots=16,maxmem=16G -numa node,mem=4G -numa node,mem=4G

there will be numa_info[0] and numa_info[1] for two nodes of 4G each. However,
the remaining 8G of hotpluggable memory isn't covered by this, and chunks
of memory from this range can be hotplugged to any node using the node=
property of the pc-dimm device.

With this understanding, I think numa_set_mem_node_id(), when called
from pc_dimm_plug(), could note/store the address range for the hot-plugged
pc-dimm in the corresponding numa_info[i]. Later this information can be
used to look up the node by address.
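
For example (addresses purely illustrative), plugging in

    -object memory-backend-ram,id=mem1,size=1G \
    -device pc-dimm,id=dimm1,memdev=mem1,node=1

would have the board assign dimm1 an address somewhere in the 8G hotplug
region, say [0x200000000, 0x240000000), and pc_dimm_plug() would then
record that extra range against numa_info[1] via
numa_set_mem_node_id(0x200000000, 0x40000000, 1).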

> 
> 
> [...]
> > > What for one needs to pull dimm properties into numa.c?
> > 
> > I am not sure I parsed the question correctly, but:
> > 
> > Original commit message explains why numa_get_node(addr) is needed:
> > 
> > > > This is needed by sPAPR PowerPC to support the
> > > > ibm,dynamic-reconfiguration-memory device tree node, which is required
> > > > for memory hotplug.
> > 
> > And to make this work, it needs to be aware of NUMA information for
> > hotplugged memory too.
> I've checked spapr_populate_drconf_memory() from original series,
> it needs to be aware at startup about address ranges -> node mapping
> including mapping partitioning of whole hotplug memory range
> (i.e. not actual hotplugged memory).
> -numa node_mem  & numa_set_mem_node_id() are sufficient for this purpose 

spapr_populate_drconf_memory() needs to know the node information for
boot-time memory as well as for hotplugged pc-dimm memory. Since chunks
of the hotplug memory range could be plugged into any node, we need to
be able to locate the node id for such a memory range. This is where
calling numa_set_mem_node_id() for each realized dimm will help.

Regards,
Bharata.
Igor Mammedov June 11, 2015, 6:56 a.m. UTC | #16
On Wed, 10 Jun 2015 18:20:53 +0530
Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:

> On Wed, Jun 10, 2015 at 11:43:19AM +0200, Igor Mammedov wrote:
> > On Tue, 9 Jun 2015 09:40:54 -0300
> > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > 
> > > On Tue, Jun 09, 2015 at 11:23:19AM +0200, Igor Mammedov wrote:
> > > > On Mon, 8 Jun 2015 12:51:39 -0300
> > > > Eduardo Habkost <ehabkost@redhat.com> wrote:
> > > > 
> > > > > On Mon, Jun 08, 2015 at 11:51:03AM +0200, Igor Mammedov wrote:
> > > > > > On Mon, 8 Jun 2015 11:28:18 +0530
> > > > > > Bharata B Rao <bharata@linux.vnet.ibm.com> wrote:
> > > > > > 
> > > > > > > On Mon, May 25, 2015 at 02:42:40PM -0300, Eduardo Habkost wrote:
> > > > > > > > On Mon, May 25, 2015 at 01:17:57PM +0530, Bharata B Rao wrote:
> > > > > > > > > On Thu, May 14, 2015 at 11:39:06AM +0200, Paolo Bonzini wrote:
> > > > > > > > > > On 13/05/2015 20:06, Eduardo Habkost wrote:
> > > > > > > > > > > Also, this introduces a circular dependency between pc-dimm.c and
> > > > > > > > > > > numa.c. Instead of that, pc-dimm could simply notify us when a new
> > > > > > > > > > > device is realized (with just (addr, end, node) as arguments), so we can
> > > > > > > > > > > save the list of memory ranges inside struct node_info.
> > > > > > > > > > > 
> > > > > > > > > > > I wonder if the memory API already provides something that would help
> > > > > > > > > > > us. Paolo, do you see a way we could simply use a MemoryRegion as input
> > > > > > > > > > > to lookup the NUMA node?
> > > > > > > > > > 
> > > > > > > > > > No, but I guess you could add a numa_get/set_memory_region_node_id API
> > > > > > > > > > that uses a hash table.  That's a variant of the "pc-dimm could simply
> > > > > > > > > > notify" numa.c that you propose above.
> > > > > > > > > 
> > > > > > > > > While you say we can't use MemoryRegion as input to lookup the NUMA node,
> > > > > > > > > you suggest that we add numa_get/set_memory_region_node_id. Does this API
> > > > > > > > > get/set NUMA node id for the given MemoryRegion ? 
> > > > > > > > 
> > > > > > > > I was going to suggest that, but it would require changing the
> > > > > > > > non-memdev code path to create a MemoryRegion for each node, too. So
> > > > > > > > having a numa_set_mem_node_id(start_addr, end_addr, node_id) API would
> > > > > > > > be simpler.
> > > > > > > 
> > > > > > > In order to save the list of memory ranges inside node_info, I tried this
> > > > > > > approach where I call
> > > > > > > 
> > > > > > > numa_set_mem_node_id(dimm.addr, dimm.size, dimm.node) from
> > > > > > > 
> > > > > > > pc_dimm_realize(), but
> > > > > > > 
> > > > > > > the value of dimm.addr is finalized only later in ->plug().
> > > > > > > 
> > > > > > > So we would have to call this API from arch code like pc_dimm_plug().
> > > > > > > Is that acceptable ?
> > > > > 
> > > > > It looks acceptable to me, as pc.c already has all the rest of the
> > > > > NUMA-specific code for PC. I believe it would be interesting to keep all
> > > > > numa.o dependencies contained inside machine code.
> > > > > 
> > > > > > Could you query pc_dimms' numa property each time you need mapping
> > > > > > instead of additionally storing that mapping elsewhere?
> > > > > 
> > > > > The original patch did that, but I suggested the
> > > > > numa_set_mem_node_id() API for two reasons: 1) not requiring special
> > > > > cases for hotplug inside numa_get_node(); 2) not introducing a circular
> > > > > dependency between pc-dimm.c and numa.c.
> > > > What circular dependency would doing foreach(pc-dimm) introduce?
> > > > So far pc-dimm is independent of numa.c and is a regular device with no
> > > > dependencies (except on the backend memdev), and it has its own 'numa'
> > > > property for providing that information to interested users. I'd rather
> > > > keep it separate from the legacy numa.c:-numa handling.
> > > 
> > > pc-dimm.c already depends on numa.c because it checks nb_numa_nodes
> > > inside pc_dimm_realize().
> > The check should be in pc_dimm_plug() instead of realize, but I guess it
> > saves duplication when pc-dimm is reused with other targets; anyway, we
> > could move it out into a generic common function.
> > 
> > > 
> > > I don't understand what you mean by "legacy numa.c:-numa handling".
> > > Unless there's a way to query pc-dimm (or other code) for all
> > > (address -> numa_node) mappings without a special case for memory
> > > hotplug[1], I wouldn't call node_info[].node_mem "legacy".
> > > 
> > > [1] And this is exactly what I want to provide with
> > >     numa_set_mem_node_id(): an API that doesn't require special cases
> > >     for memory hotplug.
> > For x86, board makers usually define the address ranges -> node mapping statically.
> > So node_info[].node_mem & numa_set_mem_node_id() make sense.
> > And sPAPR could probably do the same; no need to scan for pc-dimm devices.
> 
> I thought numa_info[i].node_mem maintains memory size information only
> for those node memories that are defined at boot time.
> 
> If I have
> 
> -m 8G,slots=16,maxmem=16G -numa node,mem=4G -numa node,mem=4G
> 
> there will be numa_info[0 & 1] for two nodes with 4G size each. However
> the remaining 8G of hotpluggable memory isn't covered by this and chunks
> of memory from this range can be hotplugged to any node using the node= property
> of the pc-dimm device.
> 
> With this understanding, I think numa_set_mem_node_id(), when called
> from pc_dimm_plug(), could note/store the address range for the hotplugged
> pc-dimm in the corresponding numa_info[i]. Later this information can be
> used to look up the node by address.
> 
> > 
> > 
> > [...]
> > > > What for one needs to pull dimm properties into numa.c?
> > > 
> > > I am not sure I parsed the question correctly, but:
> > > 
> > > Original commit message explains why numa_get_node(addr) is needed:
> > > 
> > > > > This is needed by sPAPR PowerPC to support
> > > > > ibm,dynamic-reconfiguration-memory device tree node which is needed
> > > > > for memory hotplug.
> > > 
> > > And to make this work, it needs to be aware of NUMA information for
> > > hotplugged memory too.
> > I've checked spapr_populate_drconf_memory() from the original series;
> > it needs to be aware at startup of the address ranges -> node mapping,
> > including the partitioning of the whole hotplug memory range
> > (i.e. not just the actually hotplugged memory).
> > -numa node_mem  & numa_set_mem_node_id() are sufficient for this purpose 
> 
> spapr_populate_drconf_memory() needs to know the node information for
> boot-time memory as well as for the hotplugged pc-dimm memory. Since chunks
> of the hotplug memory range could be plugged into any node, we need to
> be able to locate the node ID for such a memory range. This is where
> a numa_set_mem_node_id() call for each realized dimm will help.
So you are saying that spapr_populate_drconf_memory() doesn't need to know
in advance about unplugged memory ranges and can be updated at runtime.
(I had thought that the device tree is built only at boot and that the guest
can't accept dynamic updates to it, so you'd need to provide the
addr -> node_id mapping at boot time, including for not-yet-plugged memory.)

> 
> Regards,
> Bharata.
> 
>
Bharata B Rao June 11, 2015, 7:04 a.m. UTC | #17
On Thu, Jun 11, 2015 at 08:56:03AM +0200, Igor Mammedov wrote:
<snip>
> > > > And to make this work, it needs to be aware of NUMA information for
> > > > hotplugged memory too.
> > > I've checked spapr_populate_drconf_memory() from the original series;
> > > it needs to be aware at startup of the address ranges -> node mapping,
> > > including the partitioning of the whole hotplug memory range
> > > (i.e. not just the actually hotplugged memory).
> > > -numa node_mem  & numa_set_mem_node_id() are sufficient for this purpose 
> > 
> > spapr_populate_drconf_memory() needs to know the node information for
> > boot-time memory as well as for the hotplugged pc-dimm memory. Since chunks
> > of the hotplug memory range could be plugged into any node, we need to
> > be able to locate the node ID for such a memory range. This is where
> > a numa_set_mem_node_id() call for each realized dimm will help.
> So you are saying that spapr_populate_drconf_memory() doesn't need to know
> in advance about unplugged memory ranges and can be updated at runtime.
> (I had thought that the device tree is built only at boot and that the guest
> can't accept dynamic updates to it, so you'd need to provide the
> addr -> node_id mapping at boot time, including for not-yet-plugged memory.)

Here we are dynamically adding a device tree node at runtime when the guest
issues the ibm,client-architecture-support call during early boot. Guest
firmware (SLOF) has already been updated to support such dynamic updates.

During hotplug, the node ID information is also updated in the
ibm,dynamic-memory property present under this device tree node.
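
For context, each entry in the ibm,dynamic-memory property describes one LMB
roughly as below. This is a sketch based on my reading of the PAPR layout;
the struct and field names are illustrative, and the actual property encodes
these values as big-endian device tree cells:

    /* Logical view of one ibm,dynamic-memory LMB entry */
    struct DrconfLMBEntry {
        uint64_t base_addr;  /* logical start address of the LMB */
        uint32_t drc_index;  /* DR connector index used for hotplug */
        uint32_t reserved;
        uint32_t aa_index;   /* index into ibm,associativity-lookup-arrays;
                              * this is where the addr -> node lookup feeds */
        uint32_t flags;      /* assigned / hotplug-capable state bits */
    };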

Regards,
Bharata.
diff mbox

Patch

diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 6523b4d..19c0ba3 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -15,11 +15,14 @@  typedef struct node_info {
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
     bool present;
+    ram_addr_t mem_start;
+    ram_addr_t mem_end;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void parse_numa_opts(MachineClass *mc);
 void numa_post_machine_init(void);
 void query_numa_node_mem(uint64_t node_mem[]);
 extern QemuOptsList qemu_numa_opts;
+uint32_t numa_get_node(ram_addr_t addr, Error **errp);
 
 #endif
diff --git a/numa.c b/numa.c
index c975fb2..fdf333b 100644
--- a/numa.c
+++ b/numa.c
@@ -53,6 +53,63 @@  static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
+/*
+ * Given an address, return the index of the NUMA node to which the
+ * address belongs.
+ */
+uint32_t numa_get_node(ram_addr_t addr, Error **errp)
+{
+    uint32_t i;
+    MemoryDeviceInfoList *info_list = NULL;
+    MemoryDeviceInfoList **prev = &info_list;
+    MemoryDeviceInfoList *info;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (addr >= numa_info[i].mem_start && addr < numa_info[i].mem_end) {
+            return i;
+        }
+    }
+
+    /*
+     * If this @addr falls under cold or hotplugged memory regions,
+     * check there too.
+     */
+    qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
+    for (info = info_list; info; info = info->next) {
+        MemoryDeviceInfo *value = info->value;
+
+        if (value) {
+            switch (value->kind) {
+            case MEMORY_DEVICE_INFO_KIND_DIMM:
+                if (addr >= value->dimm->addr &&
+                        addr < (value->dimm->addr + value->dimm->size)) {
+                    qapi_free_MemoryDeviceInfoList(info_list);
+                    return value->dimm->node;
+                }
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    qapi_free_MemoryDeviceInfoList(info_list);
+    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
+                "NUMA node", addr);
+
+    return -1;
+}
+
+static void numa_set_mem_address(int nodenr)
+{
+    if (nodenr) {
+        numa_info[nodenr].mem_start = numa_info[nodenr-1].mem_end;
+    } else {
+        numa_info[nodenr].mem_start = 0;
+    }
+    numa_info[nodenr].mem_end = numa_info[nodenr].mem_start +
+                                   numa_info[nodenr].node_mem;
+}
+
 static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 {
     uint16_t nodenr;
@@ -276,6 +333,10 @@  void parse_numa_opts(MachineClass *mc)
         }
 
         for (i = 0; i < nb_numa_nodes; i++) {
+            numa_set_mem_address(i);
+        }
+
+        for (i = 0; i < nb_numa_nodes; i++) {
             if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
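
For completeness, a hypothetical caller of the new API could look like the
following; the fallback policy shown is the caller's choice, not part of
the patch:

    Error *local_err = NULL;
    uint32_t node = numa_get_node(addr, &local_err);

    if (local_err) {
        /* @addr is not covered by any node; fall back to node 0 */
        error_free(local_err);
        node = 0;
    }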