
[v5,4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

Message ID 20210628151117.545935-5-aneesh.kumar@linux.ibm.com (mailing list archive)
State Superseded
Series Add support for FORM2 associativity

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (0f7a719601eb957c10d417c62bd5f65080b5a409)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 0 warnings, 4 checks, 267 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Aneesh Kumar K V June 28, 2021, 3:11 p.m. UTC
The associativity details of the newly added resources are collected from
the hypervisor via the "ibm,configure-connector" RTAS call. Update the NUMA
distance details of the newly added NUMA node after the above call.

Instead of updating the NUMA distance every time we look up a node id
from the associativity property, add helpers that can be used
during boot and do this only once. Also remove the distance
update from the node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c        |   2 +
 arch/powerpc/platforms/pseries/pseries.h      |   1 +
 4 files changed, 132 insertions(+), 46 deletions(-)

Comments

kernel test robot June 28, 2021, 8:21 p.m. UTC | #1
Hi "Aneesh,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.13 next-20210628]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/fcbc8b19e99b1cf44fde904817f19616c6baecdb
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
        git checkout fcbc8b19e99b1cf44fde904817f19616c6baecdb
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/powerpc/mm/numa.c:298:6: warning: no previous prototype for 'update_numa_distance' [-Wmissing-prototypes]
     298 | void update_numa_distance(struct device_node *node)
         |      ^~~~~~~~~~~~~~~~~~~~


vim +/update_numa_distance +298 arch/powerpc/mm/numa.c

   294	
   295	/*
   296	 * Used to update distance information w.r.t newly added node.
   297	 */
 > 298	void update_numa_distance(struct device_node *node)
   299	{
   300		if (affinity_form == FORM0_AFFINITY)
   301			return;
   302		else if (affinity_form == FORM1_AFFINITY) {
   303			initialize_form1_numa_distance(node);
   304			return;
   305		}
   306	}
   307	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
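The warning above is reported because update_numa_distance() is defined
with external linkage in arch/powerpc/mm/numa.c while the patch only adds
its declaration to the pseries-private header pseries.h, which numa.c does
not include, so no prototype is visible at the definition site. A minimal
sketch of one way to satisfy -Wmissing-prototypes, assuming the declaration
can live in a header that numa.c already pulls in (the header named in the
comment is only an illustration, not something this patch specifies):

	/* hypothetical placement, e.g. arch/powerpc/include/asm/topology.h,
	 * so the definition in numa.c sees a previous prototype
	 */
	struct device_node;
	void update_numa_distance(struct device_node *node);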
kernel test robot June 28, 2021, 8:40 p.m. UTC | #2
Hi "Aneesh,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.13 next-20210628]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r024-20210628 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/fcbc8b19e99b1cf44fde904817f19616c6baecdb
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
        git checkout fcbc8b19e99b1cf44fde904817f19616c6baecdb
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/powerpc/mm/numa.c:298:6: warning: no previous prototype for 'update_numa_distance' [-Wmissing-prototypes]
     298 | void update_numa_distance(struct device_node *node)
         |      ^~~~~~~~~~~~~~~~~~~~
   arch/powerpc/mm/numa.c: In function 'parse_numa_properties':
>> arch/powerpc/mm/numa.c:809:7: error: implicit declaration of function '__vphn_get_associativity'; did you mean 'of_get_associativity'? [-Werror=implicit-function-declaration]
     809 |   if (__vphn_get_associativity(i, vphn_assoc) == 0) {
         |       ^~~~~~~~~~~~~~~~~~~~~~~~
         |       of_get_associativity
   cc1: some warnings being treated as errors


vim +809 arch/powerpc/mm/numa.c

   771	
   772	static int __init parse_numa_properties(void)
   773	{
   774		struct device_node *memory;
   775		int default_nid = 0;
   776		unsigned long i;
   777		const __be32 *associativity;
   778	
   779		if (numa_enabled == 0) {
   780			printk(KERN_WARNING "NUMA disabled by user\n");
   781			return -1;
   782		}
   783	
   784		primary_domain_index = find_primary_domain_index();
   785	
   786		if (primary_domain_index < 0) {
   787			/*
   788			 * if we fail to parse primary_domain_index from device tree
   789			 * mark the numa disabled, boot with numa disabled.
   790			 */
   791			numa_enabled = false;
   792			return primary_domain_index;
   793		}
   794	
   795		dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
   796	
   797		/*
   798		 * Even though we connect cpus to numa domains later in SMP
   799		 * init, we need to know the node ids now. This is because
   800		 * each node to be onlined must have NODE_DATA etc backing it.
   801		 */
   802		for_each_present_cpu(i) {
   803			__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
   804			struct device_node *cpu;
   805			int nid = NUMA_NO_NODE;
   806	
   807			memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
   808	
 > 809			if (__vphn_get_associativity(i, vphn_assoc) == 0) {
   810				nid = associativity_to_nid(vphn_assoc);
   811				__initialize_form1_numa_distance(vphn_assoc);
   812			} else {
   813	
   814				/*
   815				 * Don't fall back to default_nid yet -- we will plug
   816				 * cpus into nodes once the memory scan has discovered
   817				 * the topology.
   818				 */
   819				cpu = of_get_cpu_node(i, NULL);
   820				BUG_ON(!cpu);
   821	
   822				associativity = of_get_associativity(cpu);
   823				if (associativity) {
   824					nid = associativity_to_nid(associativity);
   825					__initialize_form1_numa_distance(associativity);
   826				}
   827				of_node_put(cpu);
   828			}
   829	
   830			node_set_online(nid);
   831		}
   832	
   833		get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
   834	
   835		for_each_node_by_type(memory, "memory") {
   836			unsigned long start;
   837			unsigned long size;
   838			int nid;
   839			int ranges;
   840			const __be32 *memcell_buf;
   841			unsigned int len;
   842	
   843			memcell_buf = of_get_property(memory,
   844				"linux,usable-memory", &len);
   845			if (!memcell_buf || len <= 0)
   846				memcell_buf = of_get_property(memory, "reg", &len);
   847			if (!memcell_buf || len <= 0)
   848				continue;
   849	
   850			/* ranges in cell */
   851			ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
   852	new_range:
   853			/* these are order-sensitive, and modify the buffer pointer */
   854			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
   855			size = read_n_cells(n_mem_size_cells, &memcell_buf);
   856	
   857			/*
   858			 * Assumption: either all memory nodes or none will
   859			 * have associativity properties.  If none, then
   860			 * everything goes to default_nid.
   861			 */
   862			associativity = of_get_associativity(memory);
   863			if (associativity) {
   864				nid = associativity_to_nid(associativity);
   865				__initialize_form1_numa_distance(associativity);
   866			} else
   867				nid = default_nid;
   868	
   869			fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
   870			node_set_online(nid);
   871	
   872			size = numa_enforce_memory_limit(start, size);
   873			if (size)
   874				memblock_set_node(start, size, &memblock.memory, nid);
   875	
   876			if (--ranges)
   877				goto new_range;
   878		}
   879	
   880		/*
   881		 * Now do the same thing for each MEMBLOCK listed in the
   882		 * ibm,dynamic-memory property in the
   883		 * ibm,dynamic-reconfiguration-memory node.
   884		 */
   885		memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
   886		if (memory) {
   887			walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
   888			of_node_put(memory);
   889		}
   890	
   891		return 0;
   892	}
   893	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
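The error above comes from __vphn_get_associativity() being introduced
inside the existing #ifdef CONFIG_PPC_SPLPAR block in numa.c while the
reworked parse_numa_properties() calls it unconditionally, so a
configuration without CONFIG_PPC_SPLPAR (as this randconfig presumably is)
never sees a declaration. A minimal sketch of one way to handle it,
assuming a stub that simply reports failure is acceptable for non-SPLPAR
builds (the stub itself is hypothetical, not part of the posted patch):

	#ifndef CONFIG_PPC_SPLPAR
	/* No VPHN without shared-processor LPAR support: report failure so
	 * callers fall back to the device-tree associativity lookup.
	 */
	static int __vphn_get_associativity(long lcpu, __be32 *associativity)
	{
		return -1;
	}
	#endif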
David Gibson July 22, 2021, 1:40 a.m. UTC | #3
On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resourced are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call.
> 
> Instead of updating NUMA distance every time we lookup a node id
> from the associativity property, add helpers that can be used
> during boot which does this only once. Also remove the distance
> update from node id lookup helpers.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>  .../platforms/pseries/hotplug-memory.c        |   2 +
>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
>  4 files changed, 132 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 0ec16999beef..7b142f79d600 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>  }
>  EXPORT_SYMBOL(__node_distance);
>  
> -static void initialize_distance_lookup_table(int nid,
> -		const __be32 *associativity)
> -{
> -	int i;
> -
> -	if (affinity_form != FORM1_AFFINITY)
> -		return;
> -
> -	for (i = 0; i < max_associativity_domain_index; i++) {
> -		const __be32 *entry;
> -
> -		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> -		distance_lookup_table[nid][i] = of_read_number(entry, 1);
> -	}
> -}
> -
>  /*
>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>   * info is found.
> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
>  		nid = NUMA_NO_NODE;
> -
> -	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= max_associativity_domain_index) {
> -		/*
> -		 * Skip the length field and send start of associativity array
> -		 */
> -		initialize_distance_lookup_table(nid, associativity + 1);
> -	}
> -
>  out:
>  	return nid;
>  }
> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> +	int i, nid;
> +
> +	if (affinity_form != FORM1_AFFINITY)

Since this shouldn't be called on a !form1 system, this could be a WARN_ON().
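A hypothetical rendering of that suggestion, if the helper were only ever
reached on FORM1 systems (the follow-up below explains why it is not):

	if (WARN_ON(affinity_form != FORM1_AFFINITY))
		return;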

> +		return;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		nid = of_read_number(&associativity[primary_domain_index], 1);

This computes the nid from the assoc array independently of
associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
it be better to call assocaitivity_to_nid(), then make the next bit
conditional on nid !== NUMA_NO_NODE?

> +
> +		for (i = 0; i < max_associativity_domain_index; i++) {
> +			const __be32 *entry;
> +
> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
> +		}
> +	}
> +}
> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> +	const __be32 *associativity;
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return;
> +
> +	__initialize_form1_numa_distance(associativity);
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> +	if (affinity_form == FORM0_AFFINITY)
> +		return;
> +	else if (affinity_form == FORM1_AFFINITY) {
> +		initialize_form1_numa_distance(node);
> +		return;
> +	}
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>  	int index;
> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>  	return 0;
>  }
>  
> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> +{
> +	struct assoc_arrays aa = { .arrays = NULL };
> +	int default_nid = NUMA_NO_NODE;
> +	int nid = default_nid;
> +	int rc, index;
> +
> +	if ((primary_domain_index < 0) || !numa_enabled)

Under what circumstances could you get primary_domain_index < 0?

> +		return default_nid;
> +
> +	rc = of_get_assoc_arrays(&aa);
> +	if (rc)
> +		return default_nid;
> +
> +	if (primary_domain_index <= aa.array_sz &&
> +	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;

Does anywhere verify that primary_domain_index <= aa.array_sz?

> +		nid = of_read_number(&aa.arrays[index], 1);
> +
> +		if (nid == 0xffff || nid >= nr_node_ids)
> +			nid = default_nid;
> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> +			int i;
> +			const __be32 *associativity;
> +
> +			index = lmb->aa_index * aa.array_sz;
> +			associativity = &aa.arrays[index];
> +			/*
> +			 * lookup array associativity entries have different format
> +			 * There is no length of the array as the first element.

The difference it very small, and this is not a hot path.  Couldn't
you reduce a chunk of code by prepending aa.array_sz, then re-using
__initialize_form1_numa_distance.  Or even making
__initialize_form1_numa_distance() take the length as a parameter.

> +			 */
> +			for (i = 0; i < max_associativity_domain_index; i++) {
> +				const __be32 *entry;
> +
> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];

Does anywhere verify that distance_ref_points[i] <= aa.array_size for
every i?

> +				distance_lookup_table[nid][i] = of_read_number(entry, 1);
> +			}
> +		}
> +	}
> +	return nid;
> +}
> +
>  /*
>   * This is like of_node_to_nid_single() for memory represented in the
>   * ibm,dynamic-reconfiguration-memory node.
> @@ -458,21 +518,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  
>  		if (nid == 0xffff || nid >= nr_node_ids)
>  			nid = default_nid;
> -
> -		if (nid > 0) {
> -			index = lmb->aa_index * aa.array_sz;
> -			initialize_distance_lookup_table(nid,
> -							&aa.arrays[index]);
> -		}
>  	}
> -
>  	return nid;
>  }
>  
>  #ifdef CONFIG_PPC_SPLPAR
> -static int vphn_get_nid(long lcpu)
> +
> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>  {
> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>  	long rc, hwid;
>  
>  	/*
> @@ -492,10 +545,22 @@ static int vphn_get_nid(long lcpu)
>  
>  		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
>  		if (rc == H_SUCCESS)
> -			return associativity_to_nid(associativity);
> +			return 0;
>  	}
>  
> +	return -1;
> +}
> +
> +static int vphn_get_nid(long lcpu)
> +{
> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> +
> +
> +	if (!__vphn_get_associativity(lcpu, associativity))
> +		return associativity_to_nid(associativity);
> +
>  	return NUMA_NO_NODE;
> +
>  }
>  #else
>  static int vphn_get_nid(long unused)
> @@ -692,7 +757,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
>  			size = read_n_cells(n_mem_size_cells, usm);
>  		}
>  
> -		nid = of_drconf_to_nid_single(lmb);
> +		nid = get_nid_and_numa_distance(lmb);
>  		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
>  					  &nid);
>  		node_set_online(nid);
> @@ -709,6 +774,7 @@ static int __init parse_numa_properties(void)
>  	struct device_node *memory;
>  	int default_nid = 0;
>  	unsigned long i;
> +	const __be32 *associativity;
>  
>  	if (numa_enabled == 0) {
>  		printk(KERN_WARNING "NUMA disabled by user\n");
> @@ -734,18 +800,30 @@ static int __init parse_numa_properties(void)
>  	 * each node to be onlined must have NODE_DATA etc backing it.
>  	 */
>  	for_each_present_cpu(i) {
> +		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>  		struct device_node *cpu;
> -		int nid = vphn_get_nid(i);
> +		int nid = NUMA_NO_NODE;
>  
> -		/*
> -		 * Don't fall back to default_nid yet -- we will plug
> -		 * cpus into nodes once the memory scan has discovered
> -		 * the topology.
> -		 */
> -		if (nid == NUMA_NO_NODE) {
> +		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));

What's the memset() for?  AFAICT you only look at vphn_assoc in the
branch where __vphn_get_associativity() succeeds.

> +
> +		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> +			nid = associativity_to_nid(vphn_assoc);
> +			__initialize_form1_numa_distance(vphn_assoc);
> +		} else {
> +
> +			/*
> +			 * Don't fall back to default_nid yet -- we will plug
> +			 * cpus into nodes once the memory scan has discovered
> +			 * the topology.
> +			 */
>  			cpu = of_get_cpu_node(i, NULL);
>  			BUG_ON(!cpu);
> -			nid = of_node_to_nid_single(cpu);
> +
> +			associativity = of_get_associativity(cpu);
> +			if (associativity) {
> +				nid = associativity_to_nid(associativity);
> +				__initialize_form1_numa_distance(associativity);
> +			}
>  			of_node_put(cpu);
>  		}
>  
> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
>  		 * have associativity properties.  If none, then
>  		 * everything goes to default_nid.
>  		 */
> -		nid = of_node_to_nid_single(memory);
> -		if (nid < 0)
> +		associativity = of_get_associativity(memory);
> +		if (associativity) {
> +			nid = associativity_to_nid(associativity);
> +			__initialize_form1_numa_distance(associativity);
> +		} else
>  			nid = default_nid;
>  
>  		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>  		return saved_rc;
>  	}
>  
> +	update_numa_distance(dn);
> +
>  	rc = dlpar_online_cpu(dn);
>  	if (rc) {
>  		saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 36f66556a7c6..40d350f31a34 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>  		return -ENODEV;
>  	}
>  
> +	update_numa_distance(lmb_node);
> +
>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>  	if (!dr_node) {
>  		dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */
Aneesh Kumar K V July 22, 2021, 7:07 a.m. UTC | #4
David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
>> The associativity details of the newly added resourced are collected from
>> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
>> distance details of the newly added numa node after the above call.
>> 
>> Instead of updating NUMA distance every time we lookup a node id
>> from the associativity property, add helpers that can be used
>> during boot which does this only once. Also remove the distance
>> update from node id lookup helpers.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
>>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>>  .../platforms/pseries/hotplug-memory.c        |   2 +
>>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
>>  4 files changed, 132 insertions(+), 46 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 0ec16999beef..7b142f79d600 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>>  }
>>  EXPORT_SYMBOL(__node_distance);
>>  
>> -static void initialize_distance_lookup_table(int nid,
>> -		const __be32 *associativity)
>> -{
>> -	int i;
>> -
>> -	if (affinity_form != FORM1_AFFINITY)
>> -		return;
>> -
>> -	for (i = 0; i < max_associativity_domain_index; i++) {
>> -		const __be32 *entry;
>> -
>> -		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
>> -		distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> -	}
>> -}
>> -
>>  /*
>>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>>   * info is found.
>> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
>>  	/* POWER4 LPAR uses 0xffff as invalid node */
>>  	if (nid == 0xffff || nid >= nr_node_ids)
>>  		nid = NUMA_NO_NODE;
>> -
>> -	if (nid > 0 &&
>> -		of_read_number(associativity, 1) >= max_associativity_domain_index) {
>> -		/*
>> -		 * Skip the length field and send start of associativity array
>> -		 */
>> -		initialize_distance_lookup_table(nid, associativity + 1);
>> -	}
>> -
>>  out:
>>  	return nid;
>>  }
>> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
>>  }
>>  EXPORT_SYMBOL(of_node_to_nid);
>>  
>> +static void __initialize_form1_numa_distance(const __be32 *associativity)
>> +{
>> +	int i, nid;
>> +
>> +	if (affinity_form != FORM1_AFFINITY)
>
> Since this shouldn't be called on a !form1 system, this could be a WARN_ON().

The way we call functions currently, instead of doing

if (affinity_form == FORM1_AFFINITY)
    __initialize_form1_numa_distance()

We avoid doing the if check in multiple places. For example
parse_numa_properties will fetch the associativity array to find the
details of online node and set it online. We use the same code path to
initialize distance.

		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
			nid = associativity_to_nid(vphn_assoc);
			__initialize_form1_numa_distance(vphn_assoc);
		} else {

			cpu = of_get_cpu_node(i, NULL);
			BUG_ON(!cpu);

			associativity = of_get_associativity(cpu);
			if (associativity) {
				nid = associativity_to_nid(associativity);
				__initialize_form1_numa_distance(associativity);
			}

We avoid the if (affinity_form == FORM1_AFFINITY) check there by
moving the check inside __initialize_form1_numa_distance().


>
>> +		return;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		nid = of_read_number(&associativity[primary_domain_index], 1);
>
> This computes the nid from the assoc array independently of
> associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
> it be better to call assocaitivity_to_nid(), then make the next bit
> conditional on nid !== NUMA_NO_NODE?

@@ -302,9 +302,8 @@ static void __initialize_form1_numa_distance(const __be32 *associativity)
 	if (affinity_form != FORM1_AFFINITY)
 		return;
 
-	if (of_read_number(associativity, 1) >= primary_domain_index) {
-		nid = of_read_number(&associativity[primary_domain_index], 1);
-
+	nid = associativity_to_nid(associativity);
+	if (nid != NUMA_NO_NODE) {
 		for (i = 0; i < distance_ref_points_depth; i++) {
 			const __be32 *entry;
 

>
>> +
>> +		for (i = 0; i < max_associativity_domain_index; i++) {
>> +			const __be32 *entry;
>> +
>> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
>> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> +		}
>> +	}
>> +}
>> +
>> +static void initialize_form1_numa_distance(struct device_node *node)
>> +{
>> +	const __be32 *associativity;
>> +
>> +	associativity = of_get_associativity(node);
>> +	if (!associativity)
>> +		return;
>> +
>> +	__initialize_form1_numa_distance(associativity);
>> +}
>> +
>> +/*
>> + * Used to update distance information w.r.t newly added node.
>> + */
>> +void update_numa_distance(struct device_node *node)
>> +{
>> +	if (affinity_form == FORM0_AFFINITY)
>> +		return;
>> +	else if (affinity_form == FORM1_AFFINITY) {
>> +		initialize_form1_numa_distance(node);
>> +		return;
>> +	}
>> +}
>> +
>>  static int __init find_primary_domain_index(void)
>>  {
>>  	int index;
>> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>>  	return 0;
>>  }
>>  
>> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
>> +{
>> +	struct assoc_arrays aa = { .arrays = NULL };
>> +	int default_nid = NUMA_NO_NODE;
>> +	int nid = default_nid;
>> +	int rc, index;
>> +
>> +	if ((primary_domain_index < 0) || !numa_enabled)
>
> Under what circumstances could you get primary_domain_index < 0?

IIUC that is to handle failure to parse device tree.
ea9f5b702fe0215188fba2eda117419e4ae90a67

>
>> +		return default_nid;
>> +
>> +	rc = of_get_assoc_arrays(&aa);
>> +	if (rc)
>> +		return default_nid;
>> +
>> +	if (primary_domain_index <= aa.array_sz &&
>> +	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
>> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>
> Does anywhere verify that primary_domain_index <= aa.array_sz?

That is the first part of the check?

>
>> +		nid = of_read_number(&aa.arrays[index], 1);
>> +
>> +		if (nid == 0xffff || nid >= nr_node_ids)
>> +			nid = default_nid;
>> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
>> +			int i;
>> +			const __be32 *associativity;
>> +
>> +			index = lmb->aa_index * aa.array_sz;
>> +			associativity = &aa.arrays[index];
>> +			/*
>> +			 * lookup array associativity entries have different format
>> +			 * There is no length of the array as the first element.
>
> The difference it very small, and this is not a hot path.  Couldn't
> you reduce a chunk of code by prepending aa.array_sz, then re-using
> __initialize_form1_numa_distance.  Or even making
> __initialize_form1_numa_distance() take the length as a parameter.

The changes are small but confusing w.r.t how we look at the
associativity-lookup-arrays. The way we interpret associativity array
and associativity lookup array using primary_domain_index is different.
Hence the '-1' in the node lookup here. 

	index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
	nid = of_read_number(&aa.arrays[index], 1);


>
>> +			 */
>> +			for (i = 0; i < max_associativity_domain_index; i++) {
>> +				const __be32 *entry;
>> +
>> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
>
> Does anywhere verify that distance_ref_points[i] <= aa.array_size for
> every i?

We do check for 

	if (primary_domain_index <= aa.array_sz &&

>
>> +				distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> +			}
>> +		}
>> +	}
>> +	return nid;
>> +}
>> +
>>  /*
>>   * This is like of_node_to_nid_single() for memory represented in the
>>   * ibm,dynamic-reconfiguration-memory node.
>> @@ -458,21 +518,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>>  
>>  		if (nid == 0xffff || nid >= nr_node_ids)
>>  			nid = default_nid;
>> -
>> -		if (nid > 0) {
>> -			index = lmb->aa_index * aa.array_sz;
>> -			initialize_distance_lookup_table(nid,
>> -							&aa.arrays[index]);
>> -		}
>>  	}
>> -
>>  	return nid;
>>  }
>>  
>>  #ifdef CONFIG_PPC_SPLPAR
>> -static int vphn_get_nid(long lcpu)
>> +
>> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>>  {
>> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>>  	long rc, hwid;
>>  
>>  	/*
>> @@ -492,10 +545,22 @@ static int vphn_get_nid(long lcpu)
>>  
>>  		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
>>  		if (rc == H_SUCCESS)
>> -			return associativity_to_nid(associativity);
>> +			return 0;
>>  	}
>>  
>> +	return -1;
>> +}
>> +
>> +static int vphn_get_nid(long lcpu)
>> +{
>> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>> +
>> +
>> +	if (!__vphn_get_associativity(lcpu, associativity))
>> +		return associativity_to_nid(associativity);
>> +
>>  	return NUMA_NO_NODE;
>> +
>>  }
>>  #else
>>  static int vphn_get_nid(long unused)
>> @@ -692,7 +757,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
>>  			size = read_n_cells(n_mem_size_cells, usm);
>>  		}
>>  
>> -		nid = of_drconf_to_nid_single(lmb);
>> +		nid = get_nid_and_numa_distance(lmb);
>>  		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
>>  					  &nid);
>>  		node_set_online(nid);
>> @@ -709,6 +774,7 @@ static int __init parse_numa_properties(void)
>>  	struct device_node *memory;
>>  	int default_nid = 0;
>>  	unsigned long i;
>> +	const __be32 *associativity;
>>  
>>  	if (numa_enabled == 0) {
>>  		printk(KERN_WARNING "NUMA disabled by user\n");
>> @@ -734,18 +800,30 @@ static int __init parse_numa_properties(void)
>>  	 * each node to be onlined must have NODE_DATA etc backing it.
>>  	 */
>>  	for_each_present_cpu(i) {
>> +		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>>  		struct device_node *cpu;
>> -		int nid = vphn_get_nid(i);
>> +		int nid = NUMA_NO_NODE;
>>  
>> -		/*
>> -		 * Don't fall back to default_nid yet -- we will plug
>> -		 * cpus into nodes once the memory scan has discovered
>> -		 * the topology.
>> -		 */
>> -		if (nid == NUMA_NO_NODE) {
>> +		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
>
> What's the memset() for?  AFAICT you only look at vphn_assoc in the
> branch where __vphn_get_associativity() succeeds.

That was done to match the existing code. We do use a zero filled array
when making that hcall in this code path. I don't see us doing that
everywhere. But didn't want to change that behaviour in this patch.

-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
 -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
  	long rc, hwid;

>
>> +
>> +		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>> +			nid = associativity_to_nid(vphn_assoc);
>> +			__initialize_form1_numa_distance(vphn_assoc);
>> +		} else {
>> +
>> +			/*
>> +			 * Don't fall back to default_nid yet -- we will plug
>> +			 * cpus into nodes once the memory scan has discovered
>> +			 * the topology.
>> +			 */
>>  			cpu = of_get_cpu_node(i, NULL);
>>  			BUG_ON(!cpu);
>> -			nid = of_node_to_nid_single(cpu);
>> +
>> +			associativity = of_get_associativity(cpu);
>> +			if (associativity) {
>> +				nid = associativity_to_nid(associativity);
>> +				__initialize_form1_numa_distance(associativity);
>> +			}
>>  			of_node_put(cpu);
>>  		}
>>  
>> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
>>  		 * have associativity properties.  If none, then
>>  		 * everything goes to default_nid.
>>  		 */
>> -		nid = of_node_to_nid_single(memory);
>> -		if (nid < 0)
>> +		associativity = of_get_associativity(memory);
>> +		if (associativity) {
>> +			nid = associativity_to_nid(associativity);
>> +			__initialize_form1_numa_distance(associativity);
>> +		} else
>>  			nid = default_nid;
>>  
>>  		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index 7e970f81d8ff..778b6ab35f0d 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>>  		return saved_rc;
>>  	}
>>  
>> +	update_numa_distance(dn);
>> +
>>  	rc = dlpar_online_cpu(dn);
>>  	if (rc) {
>>  		saved_rc = rc;
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index 36f66556a7c6..40d350f31a34 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>>  		return -ENODEV;
>>  	}
>>  
>> +	update_numa_distance(lmb_node);
>> +
>>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>>  	if (!dr_node) {
>>  		dlpar_free_cc_nodes(lmb_node);
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 1f051a786fb3..663a0859cf13 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>>  void pseries_setup_security_mitigations(void);
>>  void pseries_lpar_read_hblkrm_characteristics(void);
>>  
>> +void update_numa_distance(struct device_node *node);
>>  #endif /* _PSERIES_PSERIES_H */
>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson July 26, 2021, 2:37 a.m. UTC | #5
On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> >> The associativity details of the newly added resourced are collected from
> >> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> >> distance details of the newly added numa node after the above call.
> >> 
> >> Instead of updating NUMA distance every time we lookup a node id
> >> from the associativity property, add helpers that can be used
> >> during boot which does this only once. Also remove the distance
> >> update from node id lookup helpers.
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
> >>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
> >>  .../platforms/pseries/hotplug-memory.c        |   2 +
> >>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
> >>  4 files changed, 132 insertions(+), 46 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 0ec16999beef..7b142f79d600 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
> >>  }
> >>  EXPORT_SYMBOL(__node_distance);
> >>  
> >> -static void initialize_distance_lookup_table(int nid,
> >> -		const __be32 *associativity)
> >> -{
> >> -	int i;
> >> -
> >> -	if (affinity_form != FORM1_AFFINITY)
> >> -		return;
> >> -
> >> -	for (i = 0; i < max_associativity_domain_index; i++) {
> >> -		const __be32 *entry;
> >> -
> >> -		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >> -		distance_lookup_table[nid][i] = of_read_number(entry, 1);
> >> -	}
> >> -}
> >> -
> >>  /*
> >>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> >>   * info is found.
> >> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
> >>  	/* POWER4 LPAR uses 0xffff as invalid node */
> >>  	if (nid == 0xffff || nid >= nr_node_ids)
> >>  		nid = NUMA_NO_NODE;
> >> -
> >> -	if (nid > 0 &&
> >> -		of_read_number(associativity, 1) >= max_associativity_domain_index) {
> >> -		/*
> >> -		 * Skip the length field and send start of associativity array
> >> -		 */
> >> -		initialize_distance_lookup_table(nid, associativity + 1);
> >> -	}
> >> -
> >>  out:
> >>  	return nid;
> >>  }
> >> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
> >>  }
> >>  EXPORT_SYMBOL(of_node_to_nid);
> >>  
> >> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> >> +{
> >> +	int i, nid;
> >> +
> >> +	if (affinity_form != FORM1_AFFINITY)
> >
> > Since this shouldn't be called on a !form1 system, this could be a WARN_ON().
> 
> The way we call functions currently, instead of doing
> 
> if (affinity_form == FORM1_AFFINITY)
>     __initialize_form1_numa_distance()
> 
> We avoid doing the if check in multiple places. For example
> parse_numa_properties will fetch the associativity array to find the
> details of online node and set it online. We use the same code path to
> initialize distance.
> 
> 		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> 			nid = associativity_to_nid(vphn_assoc);
> 			__initialize_form1_numa_distance(vphn_assoc);
> 		} else {
> 
> 			cpu = of_get_cpu_node(i, NULL);
> 			BUG_ON(!cpu);
> 
> 			associativity = of_get_associativity(cpu);
> 			if (associativity) {
> 				nid = associativity_to_nid(associativity);
> 				__initialize_form1_numa_distance(associativity);
> 			}
> 
> We avoid the the if (affinity_form == FORM1_AFFINITY) check there by
> moving the check inside __initialize_form1_numa_distance().

Oh.. ok.  The only caller I spotted was already doing a test against
affinity_form.

> >> +		return;
> >> +
> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +		nid = of_read_number(&associativity[primary_domain_index], 1);
> >
> > This computes the nid from the assoc array independently of
> > associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
> > it be better to call assocaitivity_to_nid(), then make the next bit
> > conditional on nid !== NUMA_NO_NODE?
> 
> @@ -302,9 +302,8 @@ static void __initialize_form1_numa_distance(const __be32 *associativity)
>  	if (affinity_form != FORM1_AFFINITY)
>  		return;
>  
> -	if (of_read_number(associativity, 1) >= primary_domain_index) {
> -		nid = of_read_number(&associativity[primary_domain_index], 1);
> -
> +	nid = associativity_to_nid(associativity);
> +	if (nid != NUMA_NO_NODE) {
>  		for (i = 0; i < distance_ref_points_depth; i++) {
>  			const __be32 *entry;

Right.

> >
> >> +
> >> +		for (i = 0; i < max_associativity_domain_index; i++) {
> >> +			const __be32 *entry;
> >> +
> >> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> >> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
> >> +		}
> >> +	}
> >> +}
> >> +
> >> +static void initialize_form1_numa_distance(struct device_node *node)
> >> +{
> >> +	const __be32 *associativity;
> >> +
> >> +	associativity = of_get_associativity(node);
> >> +	if (!associativity)
> >> +		return;
> >> +
> >> +	__initialize_form1_numa_distance(associativity);
> >> +}
> >> +
> >> +/*
> >> + * Used to update distance information w.r.t newly added node.
> >> + */
> >> +void update_numa_distance(struct device_node *node)
> >> +{
> >> +	if (affinity_form == FORM0_AFFINITY)
> >> +		return;
> >> +	else if (affinity_form == FORM1_AFFINITY) {
> >> +		initialize_form1_numa_distance(node);
> >> +		return;
> >> +	}
> >> +}
> >> +
> >>  static int __init find_primary_domain_index(void)
> >>  {
> >>  	int index;
> >> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
> >>  	return 0;
> >>  }
> >>  
> >> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> >> +{
> >> +	struct assoc_arrays aa = { .arrays = NULL };
> >> +	int default_nid = NUMA_NO_NODE;
> >> +	int nid = default_nid;
> >> +	int rc, index;
> >> +
> >> +	if ((primary_domain_index < 0) || !numa_enabled)
> >
> > Under what circumstances could you get primary_domain_index < 0?
> 
> IIUC that is to handle failure to parse device tree.
> ea9f5b702fe0215188fba2eda117419e4ae90a67

Ok.

> >
> >> +		return default_nid;

Returning NUMA_NO_NODE explicitly, rather than an alias to it might be
clearer here, but it's not a big detail.
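A hypothetical form of that suggestion, replacing the alias at the early
exit quoted above:

	if ((primary_domain_index < 0) || !numa_enabled)
		return NUMA_NO_NODE;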

> >> +
> >> +	rc = of_get_assoc_arrays(&aa);
> >> +	if (rc)
> >> +		return default_nid;
> >> +
> >> +	if (primary_domain_index <= aa.array_sz &&
> >> +	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> >> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> >
> > Does anywhere verify that primary_domain_index <= aa.array_sz?
> 
> That is the first part of the check?

Oh, sorry, missed that.  I think I was expecting it to be an early
exit, rather than folded into the rest of this complex condition.

> 
> >
> >> +		nid = of_read_number(&aa.arrays[index], 1);
> >> +
> >> +		if (nid == 0xffff || nid >= nr_node_ids)
> >> +			nid = default_nid;
> >> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> >> +			int i;
> >> +			const __be32 *associativity;
> >> +
> >> +			index = lmb->aa_index * aa.array_sz;
> >> +			associativity = &aa.arrays[index];
> >> +			/*
> >> +			 * lookup array associativity entries have different format
> >> +			 * There is no length of the array as the first element.
> >
> > The difference it very small, and this is not a hot path.  Couldn't
> > you reduce a chunk of code by prepending aa.array_sz, then re-using
> > __initialize_form1_numa_distance.  Or even making
> > __initialize_form1_numa_distance() take the length as a parameter.
> 
> The changes are small but confusing w.r.t how we look at the
> associativity-lookup-arrays. The way we interpret associativity array
> and associativity lookup array using primary_domain_index is different.
> Hence the '-1' in the node lookup here.

They're really not, though.  It's exactly the same interpretation of
the associativity array itself - it's just that one of them has the
array prepended with a (redundant) length.  So you can make
__initialize_form1_numa_distance() work on the "bare" associativity
array, with a given length.  Here you call it with aa.array_sz as the
length, and in the other place you call it with prop[0] as the length.

> 
> 	index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> 	nid = of_read_number(&aa.arrays[index], 1);
> 
> 
> >
> >> +			 */
> >> +			for (i = 0; i < max_associativity_domain_index; i++) {
> >> +				const __be32 *entry;
> >> +
> >> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >
> > Does anywhere verify that distance_ref_points[i] <= aa.array_size for
> > every i?
> 
> We do check for 
> 
> 	if (primary_domain_index <= aa.array_sz &&

Right, but that doesn't check the other distance_ref_points entries.
Not that there's any reason to have extra entries with Form2, but we
still don't want stray array accesses.

> 
> >
> >> +				distance_lookup_table[nid][i] = of_read_number(entry, 1);
> >> +			}
> >> +		}
> >> +	}
> >> +	return nid;
> >> +}
> >> +
> >>  /*
> >>   * This is like of_node_to_nid_single() for memory represented in the
> >>   * ibm,dynamic-reconfiguration-memory node.
> >> @@ -458,21 +518,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
> >>  
> >>  		if (nid == 0xffff || nid >= nr_node_ids)
> >>  			nid = default_nid;
> >> -
> >> -		if (nid > 0) {
> >> -			index = lmb->aa_index * aa.array_sz;
> >> -			initialize_distance_lookup_table(nid,
> >> -							&aa.arrays[index]);
> >> -		}
> >>  	}
> >> -
> >>  	return nid;
> >>  }
> >>  
> >>  #ifdef CONFIG_PPC_SPLPAR
> >> -static int vphn_get_nid(long lcpu)
> >> +
> >> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
> >>  {
> >> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> >>  	long rc, hwid;
> >>  
> >>  	/*
> >> @@ -492,10 +545,22 @@ static int vphn_get_nid(long lcpu)
> >>  
> >>  		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
> >>  		if (rc == H_SUCCESS)
> >> -			return associativity_to_nid(associativity);
> >> +			return 0;
> >>  	}
> >>  
> >> +	return -1;
> >> +}
> >> +
> >> +static int vphn_get_nid(long lcpu)
> >> +{
> >> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> >> +
> >> +
> >> +	if (!__vphn_get_associativity(lcpu, associativity))
> >> +		return associativity_to_nid(associativity);
> >> +
> >>  	return NUMA_NO_NODE;
> >> +
> >>  }
> >>  #else
> >>  static int vphn_get_nid(long unused)
> >> @@ -692,7 +757,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
> >>  			size = read_n_cells(n_mem_size_cells, usm);
> >>  		}
> >>  
> >> -		nid = of_drconf_to_nid_single(lmb);
> >> +		nid = get_nid_and_numa_distance(lmb);
> >>  		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
> >>  					  &nid);
> >>  		node_set_online(nid);
> >> @@ -709,6 +774,7 @@ static int __init parse_numa_properties(void)
> >>  	struct device_node *memory;
> >>  	int default_nid = 0;
> >>  	unsigned long i;
> >> +	const __be32 *associativity;
> >>  
> >>  	if (numa_enabled == 0) {
> >>  		printk(KERN_WARNING "NUMA disabled by user\n");
> >> @@ -734,18 +800,30 @@ static int __init parse_numa_properties(void)
> >>  	 * each node to be onlined must have NODE_DATA etc backing it.
> >>  	 */
> >>  	for_each_present_cpu(i) {
> >> +		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
> >>  		struct device_node *cpu;
> >> -		int nid = vphn_get_nid(i);
> >> +		int nid = NUMA_NO_NODE;
> >>  
> >> -		/*
> >> -		 * Don't fall back to default_nid yet -- we will plug
> >> -		 * cpus into nodes once the memory scan has discovered
> >> -		 * the topology.
> >> -		 */
> >> -		if (nid == NUMA_NO_NODE) {
> >> +		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> >
> > What's the memset() for?  AFAICT you only look at vphn_assoc in the
> > branch where __vphn_get_associativity() succeeds.
> 
> That was done to match the existing code. We do use a zero filled array
> when making that hcall in this code path. I don't see us doing that
> everywhere. But didn't want to change that behaviour in this patch.
> 
> -static int vphn_get_nid(long lcpu)
> +
> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>  {
>  -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>   	long rc, hwid;

Ok, that makes sense.

> 
> >
> >> +
> >> +		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> >> +			nid = associativity_to_nid(vphn_assoc);
> >> +			__initialize_form1_numa_distance(vphn_assoc);
> >> +		} else {
> >> +
> >> +			/*
> >> +			 * Don't fall back to default_nid yet -- we will plug
> >> +			 * cpus into nodes once the memory scan has discovered
> >> +			 * the topology.
> >> +			 */
> >>  			cpu = of_get_cpu_node(i, NULL);
> >>  			BUG_ON(!cpu);
> >> -			nid = of_node_to_nid_single(cpu);
> >> +
> >> +			associativity = of_get_associativity(cpu);
> >> +			if (associativity) {
> >> +				nid = associativity_to_nid(associativity);
> >> +				__initialize_form1_numa_distance(associativity);
> >> +			}
> >>  			of_node_put(cpu);
> >>  		}
> >>  
> >> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
> >>  		 * have associativity properties.  If none, then
> >>  		 * everything goes to default_nid.
> >>  		 */
> >> -		nid = of_node_to_nid_single(memory);
> >> -		if (nid < 0)
> >> +		associativity = of_get_associativity(memory);
> >> +		if (associativity) {
> >> +			nid = associativity_to_nid(associativity);
> >> +			__initialize_form1_numa_distance(associativity);
> >> +		} else
> >>  			nid = default_nid;
> >>  
> >>  		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> index 7e970f81d8ff..778b6ab35f0d 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
> >>  		return saved_rc;
> >>  	}
> >>  
> >> +	update_numa_distance(dn);
> >> +
> >>  	rc = dlpar_online_cpu(dn);
> >>  	if (rc) {
> >>  		saved_rc = rc;
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> index 36f66556a7c6..40d350f31a34 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
> >>  		return -ENODEV;
> >>  	}
> >>  
> >> +	update_numa_distance(lmb_node);
> >> +
> >>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
> >>  	if (!dr_node) {
> >>  		dlpar_free_cc_nodes(lmb_node);
> >> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> >> index 1f051a786fb3..663a0859cf13 100644
> >> --- a/arch/powerpc/platforms/pseries/pseries.h
> >> +++ b/arch/powerpc/platforms/pseries/pseries.h
> >> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
> >>  void pseries_setup_security_mitigations(void);
> >>  void pseries_lpar_read_hblkrm_characteristics(void);
> >>  
> >> +void update_numa_distance(struct device_node *node);
> >>  #endif /* _PSERIES_PSERIES_H */
> >
>
Aneesh Kumar K V July 27, 2021, 3:32 a.m. UTC | #6
David Gibson <david@gibson.dropbear.id.au> writes:

> On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:

....

> 
>> >
>> >> +		nid = of_read_number(&aa.arrays[index], 1);
>> >> +
>> >> +		if (nid == 0xffff || nid >= nr_node_ids)
>> >> +			nid = default_nid;
>> >> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
>> >> +			int i;
>> >> +			const __be32 *associativity;
>> >> +
>> >> +			index = lmb->aa_index * aa.array_sz;
>> >> +			associativity = &aa.arrays[index];
>> >> +			/*
>> >> +			 * lookup array associativity entries have different format
>> >> +			 * There is no length of the array as the first element.
>> >
>> > The difference it very small, and this is not a hot path.  Couldn't
>> > you reduce a chunk of code by prepending aa.array_sz, then re-using
>> > __initialize_form1_numa_distance.  Or even making
>> > __initialize_form1_numa_distance() take the length as a parameter.
>> 
>> The changes are small but confusing w.r.t how we look at the
>> associativity-lookup-arrays. The way we interpret associativity array
>> and associativity lookup array using primary_domain_index is different.
>> Hence the '-1' in the node lookup here.
>
> They're really not, though.  It's exactly the same interpretation of
> the associativity array itself - it's just that one of them has the
> array prepended with a (redundant) length.  So you can make
> __initialize_form1_numa_distance() work on the "bare" associativity
> array, with a given length.  Here you call it with aa.array_sz as the
> length, and in the other place you call it with prop[0] as the length.
>
>> 
>> 	index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>> 	nid = of_read_number(&aa.arrays[index], 1);
>> 
>> 
>> >
>> >> +			 */
>> >> +			for (i = 0; i < max_associativity_domain_index; i++) {
>> >> +				const __be32 *entry;
>> >> +
>> >> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
>> >
>> > Does anywhere verify that distance_ref_points[i] <= aa.array_size for
>> > every i?
>> 
>> We do check for 
>> 
>> 	if (primary_domain_index <= aa.array_sz &&
>
> Right, but that doesn't check the other distance_ref_points entries.
> Not that there's any reason to have extra entries with Form2, but we
> still don't want stray array accesses.

This is how the change looks. I am not convinced this makes it simpler.
I will add it as the last patch, and we can drop it if we find it is not
helpful?

modified   arch/powerpc/mm/numa.c
@@ -171,20 +171,31 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-/*
- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
- * info is found.
- */
-static int associativity_to_nid(const __be32 *associativity)
+static int __associativity_to_nid(const __be32 *associativity,
+				  bool lookup_array_assoc,
+				  int max_array_index)
 {
 	int nid = NUMA_NO_NODE;
+	int index;
 
 	if (!numa_enabled)
 		goto out;
+	/*
+	 * ibm,associativity-lookup-array doesn't have element
+	 * count at the start of the associativity. Hence
+	 * decrement the primary_domain_index when used with
+	 * lookup-array associativity.
+	 */
+	if (lookup_array_assoc)
+		index = primary_domain_index - 1;
+	else {
+		index = primary_domain_index;
+		max_array_index = of_read_number(associativity, 1);
+	}
+	if (index > max_array_index)
+		goto out;
 
-	if (of_read_number(associativity, 1) >= primary_domain_index)
-		nid = of_read_number(&associativity[primary_domain_index], 1);
-
+	nid = of_read_number(&associativity[index], 1);
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= nr_node_ids)
 		nid = NUMA_NO_NODE;
@@ -192,6 +203,15 @@ static int associativity_to_nid(const __be32 *associativity)
 	return nid;
 }
 
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
+ * info is found.
+ */
+static inline int associativity_to_nid(const __be32 *associativity)
+{
+	return __associativity_to_nid(associativity, false, 0);
+}
+
 static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
 	int dist;
@@ -295,19 +315,38 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static void __initialize_form1_numa_distance(const __be32 *associativity)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+					     bool lookup_array_assoc,
+					     int max_array_index)
 {
 	int i, nid;
+	int index_offset = 0;
 
 	if (affinity_form != FORM1_AFFINITY)
 		return;
+	/*
+	 * ibm,associativity-lookup-array doesn't have element
+	 * count at the start of the associativity. Hence
+	 * decrement the distance_ref_points index when used with
+	 * lookup-array associativity.
+	 */
+	if (lookup_array_assoc)
+		index_offset = 1;
+	else
+		max_array_index = of_read_number(associativity, 1);
 
-	nid = associativity_to_nid(associativity);
+	nid = __associativity_to_nid(associativity, lookup_array_assoc, max_array_index);
 	if (nid != NUMA_NO_NODE) {
 		for (i = 0; i < distance_ref_points_depth; i++) {
 			const __be32 *entry;
+			int index = be32_to_cpu(distance_ref_points[i]) - index_offset;
 
-			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+			/*
+			 * broken hierarchy, return with broken distance table
+			 */
+			if (index > max_array_index)
+				return;
+			entry = &associativity[index];
 			distance_lookup_table[nid][i] = of_read_number(entry, 1);
 		}
 	}
@@ -321,7 +360,7 @@ static void initialize_form1_numa_distance(struct device_node *node)
 	if (!associativity)
 		return;
 
-	__initialize_form1_numa_distance(associativity);
+	__initialize_form1_numa_distance(associativity, false, 0);
 }
 
 /*
@@ -586,27 +625,14 @@ static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
 
 	if (primary_domain_index <= aa.array_sz &&
 	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
-		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
-		nid = of_read_number(&aa.arrays[index], 1);
+		const __be32 *associativity;
 
-		if (nid == 0xffff || nid >= nr_node_ids)
-			nid = default_nid;
+		index = lmb->aa_index * aa.array_sz;
+		associativity = &aa.arrays[index];
+		nid = __associativity_to_nid(associativity, true, aa.array_sz - 1);
 		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
-			int i;
-			const __be32 *associativity;
-
-			index = lmb->aa_index * aa.array_sz;
-			associativity = &aa.arrays[index];
-			/*
-			 * lookup array associativity entries have different format
-			 * There is no length of the array as the first element.
-			 */
-			for (i = 0; i < distance_ref_points_depth; i++) {
-				const __be32 *entry;
-
-				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-				distance_lookup_table[nid][i] = of_read_number(entry, 1);
-			}
+			__initialize_form1_numa_distance(associativity,
+							 true, aa.array_sz - 1);
 		}
 	}
 	return nid;
@@ -632,9 +658,11 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 
 	if (primary_domain_index <= aa.array_sz &&
 	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
-		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
-		nid = of_read_number(&aa.arrays[index], 1);
+		const __be32 *associativity;
 
+		index = lmb->aa_index * aa.array_sz;
+		associativity = &aa.arrays[index];
+		nid = __associativity_to_nid(associativity, true, aa.array_sz - 1);
 		if (nid == 0xffff || nid >= nr_node_ids)
 			nid = default_nid;
 	}
@@ -939,7 +967,7 @@ static int __init parse_numa_properties(void)
 
 		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
 			nid = associativity_to_nid(vphn_assoc);
-			__initialize_form1_numa_distance(vphn_assoc);
+			__initialize_form1_numa_distance(vphn_assoc, false, 0);
 		} else {
 
 			/*
@@ -953,7 +981,7 @@ static int __init parse_numa_properties(void)
 			associativity = of_get_associativity(cpu);
 			if (associativity) {
 				nid = associativity_to_nid(associativity);
-				__initialize_form1_numa_distance(associativity);
+				__initialize_form1_numa_distance(associativity, false, 0);
 			}
 			of_node_put(cpu);
 		}
@@ -993,7 +1021,7 @@ static int __init parse_numa_properties(void)
 		associativity = of_get_associativity(memory);
 		if (associativity) {
 			nid = associativity_to_nid(associativity);
-			__initialize_form1_numa_distance(associativity);
+			__initialize_form1_numa_distance(associativity, false, 0);
 		} else
 			nid = default_nid;
David Gibson July 27, 2021, 5:59 a.m. UTC | #7
On Tue, Jul 27, 2021 at 09:02:33AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson <david@gibson.dropbear.id.au> writes:
> >> 
> >> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> 
> ....
> 
> > 
> >> >
> >> >> +		nid = of_read_number(&aa.arrays[index], 1);
> >> >> +
> >> >> +		if (nid == 0xffff || nid >= nr_node_ids)
> >> >> +			nid = default_nid;
> >> >> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> >> >> +			int i;
> >> >> +			const __be32 *associativity;
> >> >> +
> >> >> +			index = lmb->aa_index * aa.array_sz;
> >> >> +			associativity = &aa.arrays[index];
> >> >> +			/*
> >> >> +			 * lookup array associativity entries have different format
> >> >> +			 * There is no length of the array as the first element.
> >> >
> >> > The difference is very small, and this is not a hot path.  Couldn't
> >> > you reduce a chunk of code by prepending aa.array_sz, then re-using
> >> > __initialize_form1_numa_distance.  Or even making
> >> > __initialize_form1_numa_distance() take the length as a parameter.
> >> 
> >> The changes are small but confusing w.r.t. how we look at the
> >> associativity-lookup-arrays. The way we interpret the associativity
> >> array and the associativity lookup array using primary_domain_index is
> >> different. Hence the '-1' in the node lookup here.
> >
> > They're really not, though.  It's exactly the same interpretation of
> > the associativity array itself - it's just that one of them has the
> > array prepended with a (redundant) length.  So you can make
> > __initialize_form1_numa_distance() work on the "bare" associativity
> > array, with a given length.  Here you call it with aa.array_sz as the
> > length, and in the other place you call it with prop[0] as the length.
> >
> >> 
> >> 	index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> >> 	nid = of_read_number(&aa.arrays[index], 1);
> >> 
> >> 
> >> >
> >> >> +			 */
> >> >> +			for (i = 0; i < max_associativity_domain_index; i++) {
> >> >> +				const __be32 *entry;
> >> >> +
> >> >> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >> >
> >> > Does anything verify that distance_ref_points[i] <= aa.array_sz for
> >> > every i?
> >> 
> >> We do check for 
> >> 
> >> 	if (primary_domain_index <= aa.array_sz &&
> >
> > Right, but that doesn't check the other distance_ref_points entries.
> > Not that there's any reason to have extra entries with Form2, but we
> > still don't want stray array accesses.
> 
> This is how the change looks. I am not convinced this makes it simpler.

It's not, but that's because the lookup_array_assoc flag is not needed...

> I will add that as the last patch, and we can drop it if we find it
> not helpful?
> 
> modified   arch/powerpc/mm/numa.c
> @@ -171,20 +171,31 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -/*
> - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> - * info is found.
> - */
> -static int associativity_to_nid(const __be32 *associativity)
> +static int __associativity_to_nid(const __be32 *associativity,
> +				  bool lookup_array_assoc,
> +				  int max_array_index)
>  {
>  	int nid = NUMA_NO_NODE;
> +	int index;
>  
>  	if (!numa_enabled)
>  		goto out;
> +	/*
> +	 * ibm,associativity-lookup-array doesn't have element
> +	 * count at the start of the associativity. Hence
> +	 * decrement the primary_domain_index when used with
> +	 * lookup-array associativity.
> +	 */
> +	if (lookup_array_assoc)
> +		index = primary_domain_index - 1;
> +	else {
> +		index = primary_domain_index;
> +		max_array_index = of_read_number(associativity, 1);
> +	}
> +	if (index > max_array_index)
> +		goto out;

So, the associativity-array-with-length is exactly a length, followed
by an associativity-array-without-length.  What I was suggesting is
that you make this function take only an
associativity-array-without-length, with the length passed separately.

Where you want to use it on an associativity-array-with-length, stored
in __be32 *awl, you just invoke it as:
	associativity_to_nid(awl + 1, of_read_number(awl, 1));
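
A rough, untested sketch of what that could look like (the naming and
the exact bounds check are illustrative only; the bodies just restate
the existing logic on a bare array plus an explicit element count):

	static int __associativity_to_nid(const __be32 *assoc, int array_sz)
	{
		int nid;

		if (!numa_enabled || primary_domain_index > array_sz)
			return NUMA_NO_NODE;

		/* assoc[] has no length word, so the domain sits at index - 1 */
		nid = of_read_number(&assoc[primary_domain_index - 1], 1);

		/* POWER4 LPAR uses 0xffff as invalid node */
		if (nid == 0xffff || nid >= nr_node_ids)
			nid = NUMA_NO_NODE;
		return nid;
	}

	/* "ibm,associativity" carries its length as element 0 */
	static inline int associativity_to_nid(const __be32 *associativity)
	{
		return __associativity_to_nid(associativity + 1,
					      of_read_number(associativity, 1));
	}

A lookup-array row is already "bare", so of_drconf_to_nid_single() and
friends would pass (&aa.arrays[lmb->aa_index * aa.array_sz], aa.array_sz)
directly.  __initialize_form1_numa_distance() could take the same pair
and bounds-check distance_ref_points[i] against array_sz, at which point
the lookup_array_assoc flag and the index_offset juggling go away.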

> -	if (of_read_number(associativity, 1) >= primary_domain_index)
> -		nid = of_read_number(&associativity[primary_domain_index], 1);
> -
> +	nid = of_read_number(&associativity[index], 1);
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
>  		nid = NUMA_NO_NODE;
> @@ -192,6 +203,15 @@ static int associativity_to_nid(const __be32 *associativity)
>  	return nid;
>  }
>  
> +/*
> + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> + * info is found.
> + */
> +static inline int associativity_to_nid(const __be32 *associativity)
> +{
> +	return __associativity_to_nid(associativity, false, 0);
> +}
> +
>  static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	int dist;
> @@ -295,19 +315,38 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static void __initialize_form1_numa_distance(const __be32 *associativity)
> +static void __initialize_form1_numa_distance(const __be32 *associativity,
> +					     bool lookup_array_assoc,
> +					     int max_array_index)
>  {
>  	int i, nid;
> +	int index_offset = 0;
>  
>  	if (affinity_form != FORM1_AFFINITY)
>  		return;
> +	/*
> +	 * ibm,associativity-lookup-array doesn't have element
> +	 * count at the start of the associativity. Hence
> +	 * decrement the distance_ref_points index when used with
> +	 * lookup-array associativity.
> +	 */
> +	if (lookup_array_assoc)
> +		index_offset = 1;
> +	else
> +		max_array_index = of_read_number(associativity, 1);
>  
> -	nid = associativity_to_nid(associativity);
> +	nid = __associativity_to_nid(associativity, lookup_array_assoc, max_array_index);
>  	if (nid != NUMA_NO_NODE) {
>  		for (i = 0; i < distance_ref_points_depth; i++) {
>  			const __be32 *entry;
> +			int index = be32_to_cpu(distance_ref_points[i]) - index_offset;
>  
> -			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			/*
> +			 * broken hierarchy, return with broken distance table
> +			 */
> +			if (index > max_array_index)
> +				return;
> +			entry = &associativity[index];
>  			distance_lookup_table[nid][i] = of_read_number(entry, 1);
>  		}
>  	}
> @@ -321,7 +360,7 @@ static void initialize_form1_numa_distance(struct device_node *node)
>  	if (!associativity)
>  		return;
>  
> -	__initialize_form1_numa_distance(associativity);
> +	__initialize_form1_numa_distance(associativity, false, 0);
>  }
>  
>  /*
> @@ -586,27 +625,14 @@ static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
>  
>  	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> -		nid = of_read_number(&aa.arrays[index], 1);
> +		const __be32 *associativity;
>  
> -		if (nid == 0xffff || nid >= nr_node_ids)
> -			nid = default_nid;
> +		index = lmb->aa_index * aa.array_sz;
> +		associativity = &aa.arrays[index];
> +		nid = __associativity_to_nid(associativity, true, aa.array_sz - 1);
>  		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> -			int i;
> -			const __be32 *associativity;
> -
> -			index = lmb->aa_index * aa.array_sz;
> -			associativity = &aa.arrays[index];
> -			/*
> -			 * lookup array associativity entries have different format
> -			 * There is no length of the array as the first element.
> -			 */
> -			for (i = 0; i < distance_ref_points_depth; i++) {
> -				const __be32 *entry;
> -
> -				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> -				distance_lookup_table[nid][i] = of_read_number(entry, 1);
> -			}
> +			__initialize_form1_numa_distance(associativity,
> +							 true, aa.array_sz - 1);
>  		}
>  	}
>  	return nid;
> @@ -632,9 +658,11 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  
>  	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> -		nid = of_read_number(&aa.arrays[index], 1);
> +		const __be32 *associativity;
>  
> +		index = lmb->aa_index * aa.array_sz;
> +		associativity = &aa.arrays[index];
> +		nid = __associativity_to_nid(associativity, true, aa.array_sz - 1);
>  		if (nid == 0xffff || nid >= nr_node_ids)
>  			nid = default_nid;
>  	}
> @@ -939,7 +967,7 @@ static int __init parse_numa_properties(void)
>  
>  		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>  			nid = associativity_to_nid(vphn_assoc);
> -			__initialize_form1_numa_distance(vphn_assoc);
> +			__initialize_form1_numa_distance(vphn_assoc, false, 0);
>  		} else {
>  
>  			/*
> @@ -953,7 +981,7 @@ static int __init parse_numa_properties(void)
>  			associativity = of_get_associativity(cpu);
>  			if (associativity) {
>  				nid = associativity_to_nid(associativity);
> -				__initialize_form1_numa_distance(associativity);
> +				__initialize_form1_numa_distance(associativity, false, 0);
>  			}
>  			of_node_put(cpu);
>  		}
> @@ -993,7 +1021,7 @@ static int __init parse_numa_properties(void)
>  		associativity = of_get_associativity(memory);
>  		if (associativity) {
>  			nid = associativity_to_nid(associativity);
> -			__initialize_form1_numa_distance(associativity);
> +			__initialize_form1_numa_distance(associativity, false, 0);
>  		} else
>  			nid = default_nid;
>  
>
diff mbox series

Patch

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0ec16999beef..7b142f79d600 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@  int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-		const __be32 *associativity)
-{
-	int i;
-
-	if (affinity_form != FORM1_AFFINITY)
-		return;
-
-	for (i = 0; i < max_associativity_domain_index; i++) {
-		const __be32 *entry;
-
-		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-		distance_lookup_table[nid][i] = of_read_number(entry, 1);
-	}
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@  static int associativity_to_nid(const __be32 *associativity)
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= nr_node_ids)
 		nid = NUMA_NO_NODE;
-
-	if (nid > 0 &&
-		of_read_number(associativity, 1) >= max_associativity_domain_index) {
-		/*
-		 * Skip the length field and send start of associativity array
-		 */
-		initialize_distance_lookup_table(nid, associativity + 1);
-	}
-
 out:
 	return nid;
 }
@@ -287,6 +262,49 @@  int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+	int i, nid;
+
+	if (affinity_form != FORM1_AFFINITY)
+		return;
+
+	if (of_read_number(associativity, 1) >= primary_domain_index) {
+		nid = of_read_number(&associativity[primary_domain_index], 1);
+
+		for (i = 0; i < max_associativity_domain_index; i++) {
+			const __be32 *entry;
+
+			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+			distance_lookup_table[nid][i] = of_read_number(entry, 1);
+		}
+	}
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+	const __be32 *associativity;
+
+	associativity = of_get_associativity(node);
+	if (!associativity)
+		return;
+
+	__initialize_form1_numa_distance(associativity);
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+	if (affinity_form == FORM0_AFFINITY)
+		return;
+	else if (affinity_form == FORM1_AFFINITY) {
+		initialize_form1_numa_distance(node);
+		return;
+	}
+}
+
 static int __init find_primary_domain_index(void)
 {
 	int index;
@@ -433,6 +451,48 @@  static int of_get_assoc_arrays(struct assoc_arrays *aa)
 	return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+	struct assoc_arrays aa = { .arrays = NULL };
+	int default_nid = NUMA_NO_NODE;
+	int nid = default_nid;
+	int rc, index;
+
+	if ((primary_domain_index < 0) || !numa_enabled)
+		return default_nid;
+
+	rc = of_get_assoc_arrays(&aa);
+	if (rc)
+		return default_nid;
+
+	if (primary_domain_index <= aa.array_sz &&
+	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+		nid = of_read_number(&aa.arrays[index], 1);
+
+		if (nid == 0xffff || nid >= nr_node_ids)
+			nid = default_nid;
+		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+			int i;
+			const __be32 *associativity;
+
+			index = lmb->aa_index * aa.array_sz;
+			associativity = &aa.arrays[index];
+			/*
+			 * lookup array associativity entries have different format
+			 * There is no length of the array as the first element.
+			 */
+			for (i = 0; i < max_associativity_domain_index; i++) {
+				const __be32 *entry;
+
+				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
+				distance_lookup_table[nid][i] = of_read_number(entry, 1);
+			}
+		}
+	}
+	return nid;
+}
+
 /*
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
@@ -458,21 +518,14 @@  int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 
 		if (nid == 0xffff || nid >= nr_node_ids)
 			nid = default_nid;
-
-		if (nid > 0) {
-			index = lmb->aa_index * aa.array_sz;
-			initialize_distance_lookup_table(nid,
-							&aa.arrays[index]);
-		}
 	}
-
 	return nid;
 }
 
 #ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
-	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
 	long rc, hwid;
 
 	/*
@@ -492,10 +545,22 @@  static int vphn_get_nid(long lcpu)
 
 		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
 		if (rc == H_SUCCESS)
-			return associativity_to_nid(associativity);
+			return 0;
 	}
 
+	return -1;
+}
+
+static int vphn_get_nid(long lcpu)
+{
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+
+	if (!__vphn_get_associativity(lcpu, associativity))
+		return associativity_to_nid(associativity);
+
 	return NUMA_NO_NODE;
+
 }
 #else
 static int vphn_get_nid(long unused)
@@ -692,7 +757,7 @@  static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
 			size = read_n_cells(n_mem_size_cells, usm);
 		}
 
-		nid = of_drconf_to_nid_single(lmb);
+		nid = get_nid_and_numa_distance(lmb);
 		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
 					  &nid);
 		node_set_online(nid);
@@ -709,6 +774,7 @@  static int __init parse_numa_properties(void)
 	struct device_node *memory;
 	int default_nid = 0;
 	unsigned long i;
+	const __be32 *associativity;
 
 	if (numa_enabled == 0) {
 		printk(KERN_WARNING "NUMA disabled by user\n");
@@ -734,18 +800,30 @@  static int __init parse_numa_properties(void)
 	 * each node to be onlined must have NODE_DATA etc backing it.
 	 */
 	for_each_present_cpu(i) {
+		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
 		struct device_node *cpu;
-		int nid = vphn_get_nid(i);
+		int nid = NUMA_NO_NODE;
 
-		/*
-		 * Don't fall back to default_nid yet -- we will plug
-		 * cpus into nodes once the memory scan has discovered
-		 * the topology.
-		 */
-		if (nid == NUMA_NO_NODE) {
+		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
+
+		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+			nid = associativity_to_nid(vphn_assoc);
+			__initialize_form1_numa_distance(vphn_assoc);
+		} else {
+
+			/*
+			 * Don't fall back to default_nid yet -- we will plug
+			 * cpus into nodes once the memory scan has discovered
+			 * the topology.
+			 */
 			cpu = of_get_cpu_node(i, NULL);
 			BUG_ON(!cpu);
-			nid = of_node_to_nid_single(cpu);
+
+			associativity = of_get_associativity(cpu);
+			if (associativity) {
+				nid = associativity_to_nid(associativity);
+				__initialize_form1_numa_distance(associativity);
+			}
 			of_node_put(cpu);
 		}
 
@@ -781,8 +859,11 @@  static int __init parse_numa_properties(void)
 		 * have associativity properties.  If none, then
 		 * everything goes to default_nid.
 		 */
-		nid = of_node_to_nid_single(memory);
-		if (nid < 0)
+		associativity = of_get_associativity(memory);
+		if (associativity) {
+			nid = associativity_to_nid(associativity);
+			__initialize_form1_numa_distance(associativity);
+		} else
 			nid = default_nid;
 
 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7e970f81d8ff..778b6ab35f0d 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -498,6 +498,8 @@  static ssize_t dlpar_cpu_add(u32 drc_index)
 		return saved_rc;
 	}
 
+	update_numa_distance(dn);
+
 	rc = dlpar_online_cpu(dn);
 	if (rc) {
 		saved_rc = rc;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 36f66556a7c6..40d350f31a34 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -180,6 +180,8 @@  static int update_lmb_associativity_index(struct drmem_lmb *lmb)
 		return -ENODEV;
 	}
 
+	update_numa_distance(lmb_node);
+
 	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (!dr_node) {
 		dlpar_free_cc_nodes(lmb_node);
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..663a0859cf13 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -113,4 +113,5 @@  extern u32 pseries_security_flavor;
 void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
+void update_numa_distance(struct device_node *node);
 #endif /* _PSERIES_PSERIES_H */