diff mbox series

[RFC,8/8] powerpc/papr_scm: Use FORM2 associativity details

Message ID 20210614164003.196094-9-aneesh.kumar@linux.ibm.com (mailing list archive)
State Changes Requested
Headers show
Series Add support for FORM2 associativity | expand
Related show

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (419dfbc3e05d80c5f6d6856534cd0a21c22c22de)
snowpatch_ozlabs/build-ppc64le warning Build succeeded but added 2 new sparse warnings
snowpatch_ozlabs/build-ppc64be warning Build succeeded but added 2 new sparse warnings
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 111 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Aneesh Kumar K V June 14, 2021, 4:40 p.m. UTC
FORM2 introduces a concept of secondary domain which is identical to the
concept of FORM1 primary domain. Use the secondary domain as the numa node
when using a persistent memory device. For DAX kmem use the logical domain
id introduced in FORM2; this new numa node may be a memory-only NUMA node.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
 arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 3 files changed, 45 insertions(+), 10 deletions(-)

Comments

David Gibson June 15, 2021, 3:55 a.m. UTC | #1
On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> FORM2 introduce a concept of secondary domain which is identical to the
> conceept of FORM1 primary domain. Use secondary domain as the numa node
> when using persistent memory device. For DAX kmem use the logical domain
> id introduced in FORM2. This new numa node
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>  3 files changed, 45 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 86cd2af014f7..b9ac6d02e944 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>  	return nid;
>  }
>  
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> +{
> +	int secondary_index;
> +	const __be32 *associativity;
> +
> +	if (!numa_enabled) {
> +		*primary = NUMA_NO_NODE;
> +		*secondary = NUMA_NO_NODE;
> +		return 0;
> +	}
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return -ENODEV;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> +		secondary_index = of_read_number(&distance_ref_points[1], 1);

Secondary ID is always the second reference point, but primary depends
on the length of resources?  That seems very weird.

> +		*secondary = of_read_number(&associativity[secondary_index], 1);
> +	}
> +	if (*primary == 0xffff || *primary >= nr_node_ids)
> +		*primary = NUMA_NO_NODE;
> +
> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
> +		*secondary = NUMA_NO_NODE;
> +	return 0;
> +}
> +
>  /* Returns the nid associated with the given device tree node,
>   * or -1 if not found.
>   */
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index ef26fe40efb0..9bf2f1f3ddc5 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -18,6 +18,7 @@
>  #include <asm/plpar_wrappers.h>
>  #include <asm/papr_pdsm.h>
>  #include <asm/mce.h>
> +#include "pseries.h"
>  
>  #define BIND_ANY_ADDR (~0ul)
>  
> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>  struct papr_scm_priv {
>  	struct platform_device *pdev;
>  	struct device_node *dn;
> +	int numa_node;
> +	int target_node;
>  	uint32_t drc_index;
>  	uint64_t blocks;
>  	uint64_t block_size;
> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	struct nd_mapping_desc mapping;
>  	struct nd_region_desc ndr_desc;
>  	unsigned long dimm_flags;
> -	int target_nid, online_nid;
>  	ssize_t stat_size;
>  
>  	p->bus_desc.ndctl = papr_scm_ndctl;
> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>  
>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
> -	target_nid = dev_to_node(&p->pdev->dev);
> -	online_nid = numa_map_to_online_node(target_nid);
> -	ndr_desc.numa_node = online_nid;
> -	ndr_desc.target_node = target_nid;
> +	ndr_desc.numa_node = p->numa_node;
> +	ndr_desc.target_node = p->target_node;
>  	ndr_desc.res = &p->res;
>  	ndr_desc.of_node = p->dn;
>  	ndr_desc.provider_data = p;
> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  				ndr_desc.res, p->dn);
>  		goto err;
>  	}
> -	if (target_nid != online_nid)
> -		dev_info(dev, "Region registered with target node %d and online node %d",
> -			 target_nid, online_nid);
>  
>  	mutex_lock(&papr_ndr_lock);
>  	list_add_tail(&p->region_list, &papr_nd_regions);
> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>  	struct papr_scm_priv *p;
>  	const char *uuid_str;
>  	u64 uuid[2];
> -	int rc;
> +	int rc, numa_node;
>  
>  	/* check we have all the required DT properties */
>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>  		return -ENODEV;
>  	}
>  
> -
>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>  	if (!p)
>  		return -ENOMEM;
>  
> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
> +		rc = -ENODEV;
> +		goto err;
> +	}
> +	p->numa_node = numa_map_to_online_node(numa_node);
> +	if (numa_node != p->numa_node)
> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
> +			 p->numa_node, numa_node);
> +
>  	/* Initialize the dimm mutex */
>  	mutex_init(&p->health_mutex);
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 663a0859cf13..9c2a1fc9ded1 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
>  void update_numa_distance(struct device_node *node);
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>  #endif /* _PSERIES_PSERIES_H */
Aneesh Kumar K V June 15, 2021, 5:57 a.m. UTC | #2
David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>> FORM2 introduce a concept of secondary domain which is identical to the
>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>> when using persistent memory device. For DAX kmem use the logical domain
>> id introduced in FORM2. This new numa node
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>  3 files changed, 45 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 86cd2af014f7..b9ac6d02e944 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>  	return nid;
>>  }
>>  
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>> +{
>> +	int secondary_index;
>> +	const __be32 *associativity;
>> +
>> +	if (!numa_enabled) {
>> +		*primary = NUMA_NO_NODE;
>> +		*secondary = NUMA_NO_NODE;
>> +		return 0;
>> +	}
>> +
>> +	associativity = of_get_associativity(node);
>> +	if (!associativity)
>> +		return -ENODEV;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>
> Secondary ID is always the second reference point, but primary depends
> on the length of resources?  That seems very weird.

primary_domain_index is distance_ref_point[0]. With Form2 we would find
both the primary and secondary domain IDs to be the same for all resources
other than persistent memory devices. The usage w.r.t. persistent memory is
explained in patch 7.

With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
the kernel should use when using persistent memory devices. Persistent memory devices
can also be used as regular memory using the DAX KMEM driver, and the primary domainID indicates
the numa node number the OS should use when using these devices as regular memory. The secondary
domainID is the numa node number that should be used when using this device as
persistent memory. In the latter case, we are interested in the locality of the
device to an established numa node. In the above example, if the last row represents a
persistent memory device/resource, NUMA node number 40 will be used when using the device
as regular memory and NUMA node number 0 will be the device numa node when using it as
a persistent memory device.


>
>> +		*secondary = of_read_number(&associativity[secondary_index], 1);
>> +	}
>> +	if (*primary == 0xffff || *primary >= nr_node_ids)
>> +		*primary = NUMA_NO_NODE;
>> +
>> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
>> +		*secondary = NUMA_NO_NODE;
>> +	return 0;
>> +}
>> +
>>  /* Returns the nid associated with the given device tree node,
>>   * or -1 if not found.
>>   */
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index ef26fe40efb0..9bf2f1f3ddc5 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -18,6 +18,7 @@
>>  #include <asm/plpar_wrappers.h>
>>  #include <asm/papr_pdsm.h>
>>  #include <asm/mce.h>
>> +#include "pseries.h"
>>  
>>  #define BIND_ANY_ADDR (~0ul)
>>  
>> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>>  struct papr_scm_priv {
>>  	struct platform_device *pdev;
>>  	struct device_node *dn;
>> +	int numa_node;
>> +	int target_node;
>>  	uint32_t drc_index;
>>  	uint64_t blocks;
>>  	uint64_t block_size;
>> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	struct nd_mapping_desc mapping;
>>  	struct nd_region_desc ndr_desc;
>>  	unsigned long dimm_flags;
>> -	int target_nid, online_nid;
>>  	ssize_t stat_size;
>>  
>>  	p->bus_desc.ndctl = papr_scm_ndctl;
>> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>>  
>>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
>> -	target_nid = dev_to_node(&p->pdev->dev);
>> -	online_nid = numa_map_to_online_node(target_nid);
>> -	ndr_desc.numa_node = online_nid;
>> -	ndr_desc.target_node = target_nid;
>> +	ndr_desc.numa_node = p->numa_node;
>> +	ndr_desc.target_node = p->target_node;
>>  	ndr_desc.res = &p->res;
>>  	ndr_desc.of_node = p->dn;
>>  	ndr_desc.provider_data = p;
>> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  				ndr_desc.res, p->dn);
>>  		goto err;
>>  	}
>> -	if (target_nid != online_nid)
>> -		dev_info(dev, "Region registered with target node %d and online node %d",
>> -			 target_nid, online_nid);
>>  
>>  	mutex_lock(&papr_ndr_lock);
>>  	list_add_tail(&p->region_list, &papr_nd_regions);
>> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  	struct papr_scm_priv *p;
>>  	const char *uuid_str;
>>  	u64 uuid[2];
>> -	int rc;
>> +	int rc, numa_node;
>>  
>>  	/* check we have all the required DT properties */
>>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
>> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  		return -ENODEV;
>>  	}
>>  
>> -
>>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>>  	if (!p)
>>  		return -ENOMEM;
>>  
>> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
>> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
>> +		rc = -ENODEV;
>> +		goto err;
>> +	}
>> +	p->numa_node = numa_map_to_online_node(numa_node);
>> +	if (numa_node != p->numa_node)
>> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
>> +			 p->numa_node, numa_node);
>> +
>>  	/* Initialize the dimm mutex */
>>  	mutex_init(&p->health_mutex);
>>  
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 663a0859cf13..9c2a1fc9ded1 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>>  void pseries_lpar_read_hblkrm_characteristics(void);
>>  
>>  void update_numa_distance(struct device_node *node);
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>>  #endif /* _PSERIES_PSERIES_H */
>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson June 15, 2021, 6:34 a.m. UTC | #3
On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> FORM2 introduce a concept of secondary domain which is identical to the
> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
> >> when using persistent memory device. For DAX kmem use the logical domain
> >> id introduced in FORM2. This new numa node
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
> >>  	return nid;
> >>  }
> >>  
> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> >> +{
> >> +	int secondary_index;
> >> +	const __be32 *associativity;
> >> +
> >> +	if (!numa_enabled) {
> >> +		*primary = NUMA_NO_NODE;
> >> +		*secondary = NUMA_NO_NODE;
> >> +		return 0;
> >> +	}
> >> +
> >> +	associativity = of_get_associativity(node);
> >> +	if (!associativity)
> >> +		return -ENODEV;
> >> +
> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >
> > Secondary ID is always the second reference point, but primary depends
> > on the length of resources?  That seems very weird.
> 
> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> both primary and secondary domain ID same for all resources other than
> persistent memory device. The usage w.r.t. persistent memory is
> explained in patch 7.

Right, I misunderstood

> 
> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> the kernel should use when using persistent memory devices.

This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of hierarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.

> Persistent memory devices
> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> the numa node number OS should use when using these devices as regular memory. Secondary
> domainID is the numa node number that should be used when using this device as
> persistent memory.

It's weird to me that you'd want to consider them in different nodes
for those different purposes.

> In the later case, we are interested in the locality of the
> device to an established numa node. In the above example, if the last row represents a
> persistent memory device/resource, NUMA node number 40 will be used when using the device
> as regular memory and NUMA node number 0 will be the device numa node when using it as
> a persistent memory device.

I don't really get what you mean by "locality of the device to an
established numa node".  Or at least how that's different from
anything else we're handling here.
Aneesh Kumar K V June 15, 2021, 7:05 a.m. UTC | #4
David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>> >> FORM2 introduce a concept of secondary domain which is identical to the
>> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
>> >> when using persistent memory device. For DAX kmem use the logical domain
>> >> id introduced in FORM2. This new numa node
>> >> 
>> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> >> ---
>> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>> >>  3 files changed, 45 insertions(+), 10 deletions(-)
>> >> 
>> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> >> index 86cd2af014f7..b9ac6d02e944 100644
>> >> --- a/arch/powerpc/mm/numa.c
>> >> +++ b/arch/powerpc/mm/numa.c
>> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>> >>  	return nid;
>> >>  }
>> >>  
>> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>> >> +{
>> >> +	int secondary_index;
>> >> +	const __be32 *associativity;
>> >> +
>> >> +	if (!numa_enabled) {
>> >> +		*primary = NUMA_NO_NODE;
>> >> +		*secondary = NUMA_NO_NODE;
>> >> +		return 0;
>> >> +	}
>> >> +
>> >> +	associativity = of_get_associativity(node);
>> >> +	if (!associativity)
>> >> +		return -ENODEV;
>> >> +
>> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>> >
>> > Secondary ID is always the second reference point, but primary depends
>> > on the length of resources?  That seems very weird.
>> 
>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>> both primary and secondary domain ID same for all resources other than
>> persistent memory device. The usage w.r.t. persistent memory is
>> explained in patch 7.
>
> Right, I misunderstood
>
>> 
>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>> the kernel should use when using persistent memory devices.
>
> This seems kind of bogus.  With Form1, the primary/secondary ID are a
> sort of heirarchy of distance (things with same primary ID are very
> close, things with same secondary are kinda-close, etc.).  With Form2,
> it's referring to their effective node for different purposes.
>
> Using the same terms for different meanings seems unnecessarily
> confusing.

They are essentially domainIDs. The interpretation of them is different
between Form1 and Form2. Hence I kept referring to them as primary and
secondary domainID. Any suggestion on what to name them with Form2?

>
>> Persistent memory devices
>> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>> the numa node number OS should use when using these devices as regular memory. Secondary
>> domainID is the numa node number that should be used when using this device as
>> persistent memory.
>
> It's weird to me that you'd want to consider them in different nodes
> for those different purposes.


   --------------------------------------
  |                            NUMA node0 |
  |    ProcA -----> MEMA                  |
  |     |                                 |
  |	|                                 |
  |	-------------------> PMEMB        |
  |                                       |
   ---------------------------------------

   ---------------------------------------
  |                            NUMA node1 |
  |                                       |
  |    ProcB -------> MEMC                |
  |	|                                 |
  |	-------------------> PMEMD        |
  |                                       |
  |                                       |
   ---------------------------------------
 

For a topology like the above, an application running on ProcA wants to find the
persistent memory mount local to its NUMA node. Hence when using it as a
pmem fsdax mount or devdax device we want PMEMB to have the associativity
of NUMA node0 and PMEMD to have the associativity of NUMA node 1. But when
we want to use it as memory via the dax kmem driver, we want both PMEMB
and PMEMD to appear as memory-only NUMA nodes at a distance that is
derived based on the latency of the media.

>
>> In the later case, we are interested in the locality of the
>> device to an established numa node. In the above example, if the last row represents a
>> persistent memory device/resource, NUMA node number 40 will be used when using the device
>> as regular memory and NUMA node number 0 will be the device numa node when using it as
>> a persistent memory device.
>
> I don't really get what you mean by "locality of the device to an
> established numa node".  Or at least how that's different from
> anything else we're handling here.


-aneesh
David Gibson June 17, 2021, 7:46 a.m. UTC | #5
On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson <david@gibson.dropbear.id.au> writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> >> FORM2 introduce a concept of secondary domain which is identical to the
> >> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
> >> >> when using persistent memory device. For DAX kmem use the logical domain
> >> >> id introduced in FORM2. This new numa node
> >> >> 
> >> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> >> ---
> >> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
> >> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
> >> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> >> 
> >> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> >> --- a/arch/powerpc/mm/numa.c
> >> >> +++ b/arch/powerpc/mm/numa.c
> >> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
> >> >>  	return nid;
> >> >>  }
> >> >>  
> >> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> >> >> +{
> >> >> +	int secondary_index;
> >> >> +	const __be32 *associativity;
> >> >> +
> >> >> +	if (!numa_enabled) {
> >> >> +		*primary = NUMA_NO_NODE;
> >> >> +		*secondary = NUMA_NO_NODE;
> >> >> +		return 0;
> >> >> +	}
> >> >> +
> >> >> +	associativity = of_get_associativity(node);
> >> >> +	if (!associativity)
> >> >> +		return -ENODEV;
> >> >> +
> >> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >> >
> >> > Secondary ID is always the second reference point, but primary depends
> >> > on the length of resources?  That seems very weird.
> >> 
> >> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> >> both primary and secondary domain ID same for all resources other than
> >> persistent memory device. The usage w.r.t. persistent memory is
> >> explained in patch 7.
> >
> > Right, I misunderstood
> >
> >> 
> >> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> >> the kernel should use when using persistent memory devices.
> >
> > This seems kind of bogus.  With Form1, the primary/secondary ID are a
> > sort of heirarchy of distance (things with same primary ID are very
> > close, things with same secondary are kinda-close, etc.).  With Form2,
> > it's referring to their effective node for different purposes.
> >
> > Using the same terms for different meanings seems unnecessarily
> > confusing.
> 
> They are essentially domainIDs. The interpretation of them are different
> between Form1 and Form2. Hence I kept referring to them as primary and
> secondary domainID. Any suggestion on what to name them with Form2?

My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.

> >> Persistent memory devices
> >> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> >> the numa node number OS should use when using these devices as regular memory. Secondary
> >> domainID is the numa node number that should be used when using this device as
> >> persistent memory.
> >
> > It's weird to me that you'd want to consider them in different nodes
> > for those different purposes.
> 
> 
>    --------------------------------------
>   |                            NUMA node0 |
>   |    ProcA -----> MEMA                  |
>   |     |                                 |
>   |	|                                 |
>   |	-------------------> PMEMB        |
>   |                                       |
>    ---------------------------------------
> 
>    ---------------------------------------
>   |                            NUMA node1 |
>   |                                       |
>   |    ProcB -------> MEMC                |
>   |	|                                 |
>   |	-------------------> PMEMD        |
>   |                                       |
>   |                                       |
>    ---------------------------------------
>  
> 
> For a topology like the above application running of ProcA wants to find out
> persistent memory mount local to its NUMA node. Hence when using it as
> pmem fsdax mount or devdax device we want PMEMB to have associativity
> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> we want to use it as memory using dax kmem driver, we want both PMEMB
> and PMEMD to appear as memory only NUMA node at a distance that is
> derived based on the latency of the media.

I'm still not understanding why the latency we care about is different
in the two cases.  Can you give an example of when this would result
in different actual node assignments for the two different cases?
Daniel Henrique Barboza June 17, 2021, 10:53 a.m. UTC | #6
On 6/17/21 4:46 AM, David Gibson wrote:
> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>
>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>> id introduced in FORM2. This new numa node
>>>>>>
>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>> ---
>>>>>>   arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>   arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>   arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>   3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>   	return nid;
>>>>>>   }
>>>>>>   
>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>> +{
>>>>>> +	int secondary_index;
>>>>>> +	const __be32 *associativity;
>>>>>> +
>>>>>> +	if (!numa_enabled) {
>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +
>>>>>> +	associativity = of_get_associativity(node);
>>>>>> +	if (!associativity)
>>>>>> +		return -ENODEV;
>>>>>> +
>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>
>>>>> Secondary ID is always the second reference point, but primary depends
>>>>> on the length of resources?  That seems very weird.
>>>>
>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>> both primary and secondary domain ID same for all resources other than
>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>> explained in patch 7.
>>>
>>> Right, I misunderstood
>>>
>>>>
>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>> the kernel should use when using persistent memory devices.
>>>
>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>> sort of heirarchy of distance (things with same primary ID are very
>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>> it's referring to their effective node for different purposes.
>>>
>>> Using the same terms for different meanings seems unnecessarily
>>> confusing.
>>
>> They are essentially domainIDs. The interpretation of them are different
>> between Form1 and Form2. Hence I kept referring to them as primary and
>> secondary domainID. Any suggestion on what to name them with Form2?
> 
> My point is that reusing associativity-reference-points for something
> with completely unrelated semantics seems like a very poor choice.


I agree that this reuse can be confusing. I could argue that there is
precedent for that in PAPR - FORM0 puts a different spin on the same
property as well - but there is no need to keep following existing PAPR
practices in new spec (and some might argue it's best not to).

As far as QEMU goes, renaming this property to "numa-associativity-mode"
(just an example) is a quick change to do since we separated FORM1 and FORM2
code over there.

Doing such a rename can also help with the issue of having to describe new
FORM2 semantics using "least significant boundary" or "primary domain" or
any FORM0|FORM1 related terminology.


Thanks,


Daniel



> 
>>>> Persistent memory devices
>>>> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>>>> the numa node number OS should use when using these devices as regular memory. Secondary
>>>> domainID is the numa node number that should be used when using this device as
>>>> persistent memory.
>>>
>>> It's weird to me that you'd want to consider them in different nodes
>>> for those different purposes.
>>
>>
>>     --------------------------------------
>>    |                            NUMA node0 |
>>    |    ProcA -----> MEMA                  |
>>    |     |                                 |
>>    |	|                                 |
>>    |	-------------------> PMEMB        |
>>    |                                       |
>>     ---------------------------------------
>>
>>     ---------------------------------------
>>    |                            NUMA node1 |
>>    |                                       |
>>    |    ProcB -------> MEMC                |
>>    |	|                                 |
>>    |	-------------------> PMEMD        |
>>    |                                       |
>>    |                                       |
>>     ---------------------------------------
>>   
>>
>> For a topology like the above application running of ProcA wants to find out
>> persistent memory mount local to its NUMA node. Hence when using it as
>> pmem fsdax mount or devdax device we want PMEMB to have associativity
>> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
>> we want to use it as memory using dax kmem driver, we want both PMEMB
>> and PMEMD to appear as memory only NUMA node at a distance that is
>> derived based on the latency of the media.
> 
> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?
>
Aneesh Kumar K V June 17, 2021, 10:59 a.m. UTC | #7
On 6/17/21 1:16 PM, David Gibson wrote:
> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>
>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:

...

>>> It's weird to me that you'd want to consider them in different nodes
>>> for those different purposes.
>>
>>
>>     --------------------------------------
>>    |                            NUMA node0 |
>>    |    ProcA -----> MEMA                  |
>>    |     |                                 |
>>    |	|                                 |
>>    |	-------------------> PMEMB        |
>>    |                                       |
>>     ---------------------------------------
>>
>>     ---------------------------------------
>>    |                            NUMA node1 |
>>    |                                       |
>>    |    ProcB -------> MEMC                |
>>    |	|                                 |
>>    |	-------------------> PMEMD        |
>>    |                                       |
>>    |                                       |
>>     ---------------------------------------
>>   
>>
>> For a topology like the above application running of ProcA wants to find out
>> persistent memory mount local to its NUMA node. Hence when using it as
>> pmem fsdax mount or devdax device we want PMEMB to have associativity
>> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
>> we want to use it as memory using dax kmem driver, we want both PMEMB
>> and PMEMD to appear as memory only NUMA node at a distance that is
>> derived based on the latency of the media.
> 
> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?
> 

In the above example, in order to allow use of PMEMB and PMEMD as memory 
only NUMA nodes,
we need the platform to represent each of them in its own domainID. Let's assume that
platform assigned id 40 and 41 and hence both PMEMB and PMEMD will have 
associativity array like below

{ 4, 6, 0}  -> PROCA/MEMA
{ 4, 6, 40} -> PMEMB
{ 4, 6, 41} -> PMEMD
{ 4, 6, 1} ->  PROCB/MEMB

When we want to use these devices PMEMB and PMEMD as fsdax/devdax devices, 
we essentially look for the first nearest online node, which means both 
PMEMB and PMEMD will appear as devices attached to node0. That is not 
ideal for many applications.

Using the secondary domainID index as explained here helps to associate
each PMEM device with the right group. On a non-virtualized config or hard 
partitioned config, such a device tree representation can be looked at as 
a hint to identify which socket the actual device is connected to.

-aneesh
Aneesh Kumar K V June 17, 2021, 11:11 a.m. UTC | #8
Daniel Henrique Barboza <danielhb413@gmail.com> writes:

> On 6/17/21 4:46 AM, David Gibson wrote:
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>
>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>
>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>
>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>> ---
>>>>>>>   arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>   arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>   arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>   3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>   	return nid;
>>>>>>>   }
>>>>>>>   
>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>> +{
>>>>>>> +	int secondary_index;
>>>>>>> +	const __be32 *associativity;
>>>>>>> +
>>>>>>> +	if (!numa_enabled) {
>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>> +		return 0;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>> +	if (!associativity)
>>>>>>> +		return -ENODEV;
>>>>>>> +
>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>
>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>> on the length of resources?  That seems very weird.
>>>>>
>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>> both primary and secondary domain ID same for all resources other than
>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>> explained in patch 7.
>>>>
>>>> Right, I misunderstood
>>>>
>>>>>
>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>> the kernel should use when using persistent memory devices.
>>>>
>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>> sort of heirarchy of distance (things with same primary ID are very
>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>> it's referring to their effective node for different purposes.
>>>>
>>>> Using the same terms for different meanings seems unnecessarily
>>>> confusing.
>>>
>>> They are essentially domainIDs. The interpretation of them are different
>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>> secondary domainID. Any suggestion on what to name them with Form2?
>> 
>> My point is that reusing associativity-reference-points for something
>> with completely unrelated semantics seems like a very poor choice.
>
>
> I agree that this reuse can be confusing. I could argue that there is
> precedent for that in PAPR - FORM0 puts a different spin on the same
> property as well - but there is no need to keep following existing PAPR
> practices in new spec (and some might argue it's best not to).
>
> As far as QEMU goes, renaming this property to "numa-associativity-mode"
> (just an example) is a quick change to do since we separated FORM1 and FORM2
> code over there.
>
> Doing such a rename can also help with the issue of having to describe new
> FORM2 semantics using "least significant boundary" or "primary domain" or
> any FORM0|FORM1 related terminology.
>

It is not just changing the name; we will then have to explain the
meaning of ibm,associativity-reference-points with FORM2, right?

With FORM2 we want to represent the topology better

 --------------------------------------------------------------------------------
|                                                         domainID 20            |
|   ---------------------------------------                                      |
|  |                            NUMA node1 |                                     |
|  |                                       |            --------------------     |
|  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
|  |	|                                  |           |                    |    |
|  |	---------------------------------- |-------->  |  PMEMD             |    |
|  |                                       |            --------------------     |
|  |                                       |                                     |
|   ---------------------------------------                                      |
 --------------------------------------------------------------------------------

ibm,associativity:
        { 20, 1, 40}  -> PMEMD
        { 20, 1, 1}  -> PROCB/MEMC

is the suggested FORM2 representation.

-aneesh
Aneesh Kumar K V June 17, 2021, 11:46 a.m. UTC | #9
On 6/17/21 4:41 PM, Aneesh Kumar K.V wrote:
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 6/17/21 4:46 AM, David Gibson wrote:
>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>>
>>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>>
>>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>>> ---
>>>>>>>>    arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>>    arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>>    arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>>    3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>>    	return nid;
>>>>>>>>    }
>>>>>>>>    
>>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>>> +{
>>>>>>>> +	int secondary_index;
>>>>>>>> +	const __be32 *associativity;
>>>>>>>> +
>>>>>>>> +	if (!numa_enabled) {
>>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>>> +		return 0;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>>> +	if (!associativity)
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>>
>>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>>> on the length of resources?  That seems very weird.
>>>>>>
>>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>>> both primary and secondary domain ID same for all resources other than
>>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>>> explained in patch 7.
>>>>>
>>>>> Right, I misunderstood
>>>>>
>>>>>>
>>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>>> the kernel should use when using persistent memory devices.
>>>>>
>>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>>> sort of heirarchy of distance (things with same primary ID are very
>>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>>> it's referring to their effective node for different purposes.
>>>>>
>>>>> Using the same terms for different meanings seems unnecessarily
>>>>> confusing.
>>>>
>>>> They are essentially domainIDs. The interpretation of them are different
>>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>>> secondary domainID. Any suggestion on what to name them with Form2?
>>>
>>> My point is that reusing associativity-reference-points for something
>>> with completely unrelated semantics seems like a very poor choice.
>>
>>
>> I agree that this reuse can be confusing. I could argue that there is
>> precedent for that in PAPR - FORM0 puts a different spin on the same
>> property as well - but there is no need to keep following existing PAPR
>> practices in new spec (and some might argue it's best not to).
>>
>> As far as QEMU goes, renaming this property to "numa-associativity-mode"
>> (just an example) is a quick change to do since we separated FORM1 and FORM2
>> code over there.
>>
>> Doing such a rename can also help with the issue of having to describe new
>> FORM2 semantics using "least significant boundary" or "primary domain" or
>> any FORM0|FORM1 related terminology.
>>
> 
> It is not just changing the name, we will then have to explain the
> meaning of ibm,associativity-reference-points with FORM2 right?
> 
> With FORM2 we want to represent the topology better
> 
>   --------------------------------------------------------------------------------
> |                                                         domainID 20            |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMD             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
>   --------------------------------------------------------------------------------
> 
> ibm,associativity:
>          { 20, 1, 40}  -> PMEMD
>          { 20, 1, 1}  -> PROCB/MEMC
> 
> is the suggested FORM2 representation.
> 
>

We can simplify this as below too

ibm,associativity:
         { 20, 1, 40}  -> PMEMD
         { 20, 1 }  -> PROCB/MEMC

-aneesh
Aneesh Kumar K V June 17, 2021, 1:55 p.m. UTC | #10
David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>> >> David Gibson <david@gibson.dropbear.id.au> writes:
>> >> 
>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
.....

> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?

How about the below update?

With Form2 "ibm,associativity" for resources is listed as below:

"ibm,associativity" property for resources in node 0, 8 and 40
{ 3, 6, 7, 0 }
{ 3, 6, 9, 8 }
{ 4, 6, 7, 0, 40}

With "ibm,associativity-reference-points"  { 0x3, 0x2 }

Form2 adds an additional property which can be used with devices like persistent
memory devices which would also like to be presented as memory-only NUMA nodes.

"ibm,associativity-memory-node-reference-point" property contains a number
representing the domainID index to be used to find the domainID that should be used
when using the resource as memory only NUMA node. The NUMA distance information
w.r.t this domainID will take into consideration the latency of the media. A
high latency memory device will have a large NUMA distance value assigned w.r.t
the domainID found at the "ibm,associativity-memory-node-reference-point" domainID index.

prop-encoded-array: An integer encoded as with encode-int specifying the domainID index

In the above example:
"ibm,associativity-memory-node-reference-point"  { 0x4 }

ex:

   --------------------------------------
  |                            NUMA node0 |
  |    ProcA -----> MEMA                  |
  |     |                                 |
  |	|                                 |
  |	-------------------> PMEMB        |
  |                                       |
   ---------------------------------------

   ---------------------------------------
  |                            NUMA node1 |
  |                                       |
  |    ProcB -------> MEMC                |
  |	|                                 |
  |	-------------------> PMEMD        |
  |                                       |
  |                                       |
   ---------------------------------------

 --------------------------------------------------------------------------------
|                                                      domainID 20               |
|   ---------------------------------------                                      |
|  |                            NUMA node0 |                                     |
|  |                                       |            --------------------     |
|  |    ProcA -------> MEMA                |           |        NUMA node40 |    |
|  |	|                                  |           |                    |    |
|  |	---------------------------------- |-------->  |  PMEMB             |    |
|  |                                       |            --------------------     |
|  |                                       |                                     |
|   ---------------------------------------                                      |
|                                                                                |
|   ---------------------------------------                                      |
|  |                            NUMA node1 |                                     |
|  |                                       |                                     |
|  |    ProcB -------> MEMC                |           -------------------       |
|  |	|                                  |          |       NUMA node41 |      |
|  |	--------------------------------------------> | PMEMD             |      |
|  |                                       |           -------------------       |
|  |                                       |                                     |
|   ---------------------------------------                                      |
|                                                                                |
 --------------------------------------------------------------------------------

For a topology like the above, an application running on ProcA wants to find
the persistent memory mount local to its NUMA node. Hence when using it as
pmem fsdax mount or devdax device we want PMEMB to have associativity
of NUMA node0 and PMEMD to have associativity of NUMA node1. But when
we want to use it as memory using dax kmem driver, we want both PMEMB
and PMEMD to appear as memory only NUMA node at a distance that is
derived based on the latency of the media.

"ibm,associativity":
PROCA/MEMA -> { 2, 20, 0 } 
PROCB/MEMC -> { 2, 20, 1 } 
PMEMB      -> { 3, 20, 0, 40}
PMEMD      -> { 3, 20, 1, 41}

"ibm,associativity-reference-points" -> { 2, 1 }
"ibm,associativity-memory-node-reference-points" -> { 3 }

-aneesh
Aneesh Kumar K V June 17, 2021, 2:04 p.m. UTC | #11
Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:

> David Gibson <david@gibson.dropbear.id.au> writes:
>
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>> 
>>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>> >> David Gibson <david@gibson.dropbear.id.au> writes:
>>> >> 
>>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> .....
>
>> I'm still not understanding why the latency we care about is different
>> in the two cases.  Can you give an example of when this would result
>> in different actual node assignments for the two different cases?
>
> How about the below update?
>
> With Form2 "ibm,associativity" for resources is listed as below:
>
> "ibm,associativity" property for resources in node 0, 8 and 40
> { 3, 6, 7, 0 }
> { 3, 6, 9, 8 }
> { 4, 6, 7, 0, 40}
>
> With "ibm,associativity-reference-points"  { 0x3, 0x2 }
>
> Form2 adds additional property which can be used with devices like persistence
> memory devices which would also like to be presented as memory-only NUMA nodes.
>
> "ibm,associativity-memory-node-reference-point" property contains a number
> representing the domainID index to be used to find the domainID that should be used
> when using the resource as memory only NUMA node. The NUMA distance information
> w.r.t this domainID will take into consideration the latency of the media. A
> high latency memory device will have a large NUMA distance value assigned w.r.t
> the domainID found at at "ibm,associativity-memory-node-reference-point" domainID index.
>
> prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
>
> In the above example:
> "ibm,associativity-memory-node-reference-point"  { 0x4 }
>
> ex:
>
>    --------------------------------------
>   |                            NUMA node0 |
>   |    ProcA -----> MEMA                  |
>   |     |                                 |
>   |	|                                 |
>   |	-------------------> PMEMB        |
>   |                                       |
>    ---------------------------------------
>
>    ---------------------------------------
>   |                            NUMA node1 |
>   |                                       |
>   |    ProcB -------> MEMC                |
>   |	|                                 |
>   |	-------------------> PMEMD        |
>   |                                       |
>   |                                       |
>    ---------------------------------------
>
>  --------------------------------------------------------------------------------
> |                                                      domainID 20               |
> |   ---------------------------------------                                      |
> |  |                            NUMA node0 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcA -------> MEMA                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMB             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
> |                                                                                |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |                                     |
> |  |    ProcB -------> MEMC                |           -------------------       |
> |  |	|                                  |          |       NUMA node41 |      |
> |  |	--------------------------------------------> | PMEMD             |      |
> |  |                                       |           -------------------       |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
> |                                                                                |
>  --------------------------------------------------------------------------------
>
> For a topology like the above application running of ProcA wants to find out
> persistent memory mount local to its NUMA node. Hence when using it as
> pmem fsdax mount or devdax device we want PMEMB to have associativity
> of NUMA node0 and PMEMD to have associativity of NUMA node1. But when
> we want to use it as memory using dax kmem driver, we want both PMEMB
> and PMEMD to appear as memory only NUMA node at a distance that is
> derived based on the latency of the media.
>
> "ibm,associativity":
> PROCA/MEMA -> { 2, 20, 0 } 
> PROCB/MEMC -> { 2, 20, 1 } 
> PMEMB      -> { 3, 20, 0, 40}
> PMEMB      -> { 3, 20, 1, 41}
>
> "ibm,associativity-reference-points" -> { 2, 1 }
> "ibm,associativity-memory-node-reference-points" -> { 3 }

Another option is to make sure that numa-distance-value is populated
such that the PMEMB distance indicates it is closer to node0 when compared
to node1, i.e., node_distance[40][0] < node_distance[40][1]. One could
possibly infer the grouping based on the distance value and not depend
on ibm,associativity for that purpose.

-aneesh
Daniel Henrique Barboza June 17, 2021, 8 p.m. UTC | #12
On 6/17/21 8:11 AM, Aneesh Kumar K.V wrote:
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 6/17/21 4:46 AM, David Gibson wrote:
>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>>
>>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>>
>>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>>> ---
>>>>>>>>    arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>>    arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>>    arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>>    3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>>    	return nid;
>>>>>>>>    }
>>>>>>>>    
>>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>>> +{
>>>>>>>> +	int secondary_index;
>>>>>>>> +	const __be32 *associativity;
>>>>>>>> +
>>>>>>>> +	if (!numa_enabled) {
>>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>>> +		return 0;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>>> +	if (!associativity)
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>>
>>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>>> on the length of resources?  That seems very weird.
>>>>>>
>>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>>> both primary and secondary domain ID same for all resources other than
>>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>>> explained in patch 7.
>>>>>
>>>>> Right, I misunderstood
>>>>>
>>>>>>
>>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>>> the kernel should use when using persistent memory devices.
>>>>>
>>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>>> sort of heirarchy of distance (things with same primary ID are very
>>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>>> it's referring to their effective node for different purposes.
>>>>>
>>>>> Using the same terms for different meanings seems unnecessarily
>>>>> confusing.
>>>>
>>>> They are essentially domainIDs. The interpretation of them are different
>>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>>> secondary domainID. Any suggestion on what to name them with Form2?
>>>
>>> My point is that reusing associativity-reference-points for something
>>> with completely unrelated semantics seems like a very poor choice.
>>
>>
>> I agree that this reuse can be confusing. I could argue that there is
>> precedent for that in PAPR - FORM0 puts a different spin on the same
>> property as well - but there is no need to keep following existing PAPR
>> practices in new spec (and some might argue it's best not to).
>>
>> As far as QEMU goes, renaming this property to "numa-associativity-mode"
>> (just an example) is a quick change to do since we separated FORM1 and FORM2
>> code over there.
>>
>> Doing such a rename can also help with the issue of having to describe new
>> FORM2 semantics using "least significant boundary" or "primary domain" or
>> any FORM0|FORM1 related terminology.
>>
> 
> It is not just changing the name, we will then have to explain the
> meaning of ibm,associativity-reference-points with FORM2 right?

Hmmmm why? My idea over there was to add a new property that indicates that a
resource might have a different NUMA affinity based on the mode of operation
(like PMEM), and get rid of ibm,associativity-reference-points altogether.

The NUMA distances already express the topology. Closer distances indicate
closer proximity, larger distances indicate otherwise. Having
"associativity-reference-points" to reflect an associativity domain
relationship, when you already have all the distances from each node, is
somewhat redundant.

The concept of 'associativity domain' was necessary in FORM1 because we had no
other way of telling distance between NUMA nodes. We needed to rely on these
overly complex and convoluted subdomain abstractions to say that "nodeA belongs
to the same third-level domain as node B, and in the second-level domain with
node C". The kernel would read that and calculate that each level is doubling
the distance from the level before and local_distance is 10, so:

distAA = 10  distAB= 20 distAC = 40

With FORM2, if this information is already explicit in ibm,numa-distance-table,
why bother calculating associativity domains? If you want to know whether
PROCA is closer to PROCB or PROCX, just look at the NUMA distance table and
see which one is closer.

  

> 
> With FORM2 we want to represent the topology better
> 
>   --------------------------------------------------------------------------------
> |                                                         domainID 20            |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMD             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
>   --------------------------------------------------------------------------------
> 
> ibm,associativity:
>          { 20, 1, 40}  -> PMEMD
>          { 20, 1, 1}  -> PROCB/MEMC
> 
> is the suggested FORM2 representation.


The way I see it, the '20' over there is not needed at all. What utility does
it bring? And why create an associativity domain '1' in the MEMC associativity
at 0x3?

What the current QEMU FORM2 implementation is doing would be this:

           { 0, 0, 1, 40}  -> PMEMD
           { 0, 0, 0, 1}  -> PROCB/MEMC


PMEMD has a pointer to the NUMA node in which it would run as persistent
memory, node 1. All the memory/cpu nodes of node1 would be oblivious
to what PMEMD is doing.

I don't see the need of creating an associativity domain between node1
and node40 in 0x3. Besides, if a device_add operation of a PMEM that wants
to use nodeN as the node for persistent memory would trigger a massive
ibm,associativity update, on all LMBs that belongs to nodeN, because then
everyone needs to have the same third level associativity domain as the
hotplugged PMEM. To avoid that, if the idea is to 'just duplicate the
logical_domain_id in 0x3 for all non-PMEM devices' then what's the
difference of looking into the logical_numa_id at 0x4 in the first
place?



In fact, the more I speak about this PMEM scenario the more I wonder:
why doesn't the PMEM driver, when switching from persistent to regular
memory and vice-versa, take care of all the necessary updates in the
numa-distance-table and kernel internals to reflect the current distances
of its current mode? Is this a technical limitation?



Thanks


Daniel


> 
> -aneesh
>
Aneesh Kumar K V June 18, 2021, 3:18 a.m. UTC | #13
On 6/18/21 1:30 AM, Daniel Henrique Barboza wrote:
> 
> 
> On 6/17/21 8:11 AM, Aneesh Kumar K.V wrote:
>> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
>>
>>> On 6/17/21 4:46 AM, David Gibson wrote:
>>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>

> 
> 
> In fact, the more I speak about this PMEM scenario the more I wonder:
> why doesn't the PMEM driver, when switching from persistent to regular
> memory and vice-versa, take care of all the necessary updates in the
> numa-distance-table and kernel internals to reflect the current distances
> of its current mode? Is this a technical limitation?
> 
> 

I sent v4 doing something similar to this .

-aneesh
David Gibson June 24, 2021, 3:16 a.m. UTC | #14
On Thu, Jun 17, 2021 at 04:29:01PM +0530, Aneesh Kumar K.V wrote:
> On 6/17/21 1:16 PM, David Gibson wrote:
> > On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> > > David Gibson <david@gibson.dropbear.id.au> writes:
> > > 
> > > > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> > > > > David Gibson <david@gibson.dropbear.id.au> writes:
> 
> ...
> 
> > > > It's weird to me that you'd want to consider them in different nodes
> > > > for those different purposes.
> > > 
> > > 
> > >     --------------------------------------
> > >    |                            NUMA node0 |
> > >    |    ProcA -----> MEMA                  |
> > >    |     |                                 |
> > >    |	|                                 |
> > >    |	-------------------> PMEMB        |
> > >    |                                       |
> > >     ---------------------------------------
> > > 
> > >     ---------------------------------------
> > >    |                            NUMA node1 |
> > >    |                                       |
> > >    |    ProcB -------> MEMC                |
> > >    |	|                                 |
> > >    |	-------------------> PMEMD        |
> > >    |                                       |
> > >    |                                       |
> > >     ---------------------------------------
> > > 
> > > For a topology like the above, an application running on ProcA wants to find out
> > > persistent memory mount local to its NUMA node. Hence when using it as
> > > pmem fsdax mount or devdax device we want PMEMB to have associativity
> > > of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> > > we want to use it as memory using dax kmem driver, we want both PMEMB
> > > and PMEMD to appear as memory only NUMA node at a distance that is
> > > derived based on the latency of the media.
> > 
> > I'm still not understanding why the latency we care about is different
> > in the two cases.  Can you give an example of when this would result
> > in different actual node assignments for the two different cases?
> > 
> 
> In the above example, in order to allow use of PMEMB and PMEMD as memory-only
> NUMA nodes,
> we need the platform to represent them in their own domainIDs. Let's assume that
> platform assigned id 40 and 41 and hence both PMEMB and PMEMD will have
> associativity array like below
> 
> { 4, 6, 0}  -> PROCA/MEMA
> { 4, 6, 40} -> PMEMB
> { 4, 6, 41} -> PMEMD
> { 4, 6, 1} ->  PROCB/MEMB
> 
> When we want to use this device PMEMB and PMEMD as fsdax/devdax devices, we
> essentially look for the first nearest online node. Which means both PMEMB
> and PMEMD will appear as devices attached to node0. That is not ideal
> for many applications.

Not if you actually look at the distance table which tells you that
PMEMB is closer to node0 and PMEMD is closer to node1.  That's exactly
what the distance table is for - making this information explicit,
rather than intuited from a confusing set of nested domains.
diff mbox series

Patch

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 86cd2af014f7..b9ac6d02e944 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@  static int associativity_to_nid(const __be32 *associativity)
 	return nid;
 }
 
+/*
+ * Look up the FORM2 primary (persistent-memory) and secondary (regular
+ * memory) NUMA domain ids from a node's ibm,associativity property.
+ * Returns 0 on success (out-params may be NUMA_NO_NODE), -ENODEV if the
+ * node carries no associativity property.
+ */
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
+{
+	int secondary_index;
+	const __be32 *associativity;
+
+	/*
+	 * Default both out-params up front: if the associativity array is
+	 * shorter than primary_domain_index we would otherwise compare
+	 * uninitialized values below (undefined behavior).
+	 */
+	*primary = NUMA_NO_NODE;
+	*secondary = NUMA_NO_NODE;
+	if (!numa_enabled)
+		return 0;
+
+	associativity = of_get_associativity(node);
+	if (!associativity)
+		return -ENODEV;
+
+	if (of_read_number(associativity, 1) >= primary_domain_index) {
+		*primary = of_read_number(&associativity[primary_domain_index], 1);
+		secondary_index = of_read_number(&distance_ref_points[1], 1);
+		*secondary = of_read_number(&associativity[secondary_index], 1);
+	}
+	/* 0xffff marks "no domain"; also guard against out-of-range ids */
+	if (*primary == 0xffff || *primary >= nr_node_ids)
+		*primary = NUMA_NO_NODE;
+
+	if (*secondary == 0xffff || *secondary >= nr_node_ids)
+		*secondary = NUMA_NO_NODE;
+	return 0;
+}
+
 /* Returns the nid associated with the given device tree node,
  * or -1 if not found.
  */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index ef26fe40efb0..9bf2f1f3ddc5 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -18,6 +18,7 @@ 
 #include <asm/plpar_wrappers.h>
 #include <asm/papr_pdsm.h>
 #include <asm/mce.h>
+#include "pseries.h"
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -88,6 +89,8 @@  struct papr_scm_perf_stats {
 struct papr_scm_priv {
 	struct platform_device *pdev;
 	struct device_node *dn;
+	int numa_node;
+	int target_node;
 	uint32_t drc_index;
 	uint64_t blocks;
 	uint64_t block_size;
@@ -923,7 +926,6 @@  static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	struct nd_mapping_desc mapping;
 	struct nd_region_desc ndr_desc;
 	unsigned long dimm_flags;
-	int target_nid, online_nid;
 	ssize_t stat_size;
 
 	p->bus_desc.ndctl = papr_scm_ndctl;
@@ -974,10 +976,8 @@  static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
 
 	memset(&ndr_desc, 0, sizeof(ndr_desc));
-	target_nid = dev_to_node(&p->pdev->dev);
-	online_nid = numa_map_to_online_node(target_nid);
-	ndr_desc.numa_node = online_nid;
-	ndr_desc.target_node = target_nid;
+	ndr_desc.numa_node = p->numa_node;
+	ndr_desc.target_node = p->target_node;
 	ndr_desc.res = &p->res;
 	ndr_desc.of_node = p->dn;
 	ndr_desc.provider_data = p;
@@ -1001,9 +1001,6 @@  static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 				ndr_desc.res, p->dn);
 		goto err;
 	}
-	if (target_nid != online_nid)
-		dev_info(dev, "Region registered with target node %d and online node %d",
-			 target_nid, online_nid);
 
 	mutex_lock(&papr_ndr_lock);
 	list_add_tail(&p->region_list, &papr_nd_regions);
@@ -1096,7 +1093,7 @@  static int papr_scm_probe(struct platform_device *pdev)
 	struct papr_scm_priv *p;
 	const char *uuid_str;
 	u64 uuid[2];
-	int rc;
+	int rc, numa_node;
 
 	/* check we have all the required DT properties */
 	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
@@ -1119,11 +1116,20 @@  static int papr_scm_probe(struct platform_device *pdev)
 		return -ENODEV;
 	}
 
-
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
+	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
+		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
+		rc = -ENODEV;
+		goto err;
+	}
+	p->numa_node = numa_map_to_online_node(numa_node);
+	if (numa_node != p->numa_node)
+		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
+			 p->numa_node, numa_node);
+
 	/* Initialize the dimm mutex */
 	mutex_init(&p->health_mutex);
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 663a0859cf13..9c2a1fc9ded1 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -114,4 +114,5 @@  void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
 void update_numa_distance(struct device_node *node);
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
 #endif /* _PSERIES_PSERIES_H */