[v2] npu2: Add nvlink2 interconnect information

Message ID 20181109054338.33932-1-aik@ozlabs.ru
State New
Headers show
Series
  • [v2] npu2: Add nvlink2 interconnect information
Related show

Checks

Context Check Description
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Alexey Kardashevskiy Nov. 9, 2018, 5:43 a.m.
GPUs on Redbud and Sequoia platforms are interconnected between each
other in groups of 2 or 3 GPUs. The problem with that is if we decide
to pass one of GPUs in a group to the userspace (and potentially a guest),
we need to make sure that interconnectd link does not get enabled.

The GPU firmware provides a way to disable links on a GPU. However we
want to disable only links to other GPUs which are not in the same guest
so we need a map of what nvlink is connected to what.

This adds an "ibm,nvlink-peers" property to every GPU in a "GPUn" slot
with phandles to peer GPUs and NPU PHB, the index in the property is GPU's
link number.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* s/ibm,nvlinks/ibm,nvlink-peers/
---
 hw/npu2.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

Comments

Alexey Kardashevskiy Nov. 19, 2018, 6:20 a.m. | #1
Ping?

In general, on systems without a magic vpd or even with it, it is not
very clear that the highest "ibm,npu-group-id" is the way to go. Can
anyone think of a better way to tell what system we got?



On 09/11/2018 16:43, Alexey Kardashevskiy wrote:
> GPUs on Redbud and Sequoia platforms are interconnected between each
> other in groups of 2 or 3 GPUs. The problem with that is if we decide
> to pass one of GPUs in a group to the userspace (and potentially a guest),
> we need to make sure that interconnectd link does not get enabled.
> 
> The GPU firmware provides a way to disable links on a GPU. However we
> want to disable only links to other GPUs which are not in the same guest
> so we need a map of what nvlink is connected to what.
> 
> This adds an "ibm,nvlink-peers" property to every GPU in a "GPUn" slot
> with phandles to peer GPUs and NPU PHB, the index in the property is GPU's
> link number.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * s/ibm,nvlinks/ibm,nvlink-peers/
> ---
>  hw/npu2.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 87 insertions(+)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index d7d94357..ba1264be 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -732,6 +732,91 @@ static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
>  	xscom_write_mask(gcid, 0x50114c0, val, mask);
>  }
>  
> +static int gpu_slot_to_num(const char *slot)
> +{
> +	char *p = NULL;
> +	int ret;
> +
> +	if (!slot)
> +		return -1;
> +
> +	if (memcmp(slot, "GPU", 3))
> +		return -1;
> +
> +	ret = strtol(slot + 3, &p, 10);
> +	if (*p || p == slot)
> +		return -1;
> +
> +	return ret;
> +}
> +
> +static void npu2_phb_nvlink_dt(struct phb *npuphb, int links_per_gpu)
> +{
> +	struct dt_node *g[3] = { 0 }; /* Current maximum is 3 GPUs per 1 NPU */
> +	const int max_gpus = 6 / links_per_gpu;
> +	struct npu2 *npu2_phb = phb_to_npu2_nvlink(npuphb);
> +	const u32 npuph = npuphb->dt_node->phandle;
> +	int i, gpuid, first = max_gpus, last = 0;
> +
> +	/* Find the indexes of GPUs connected to this NPU */
> +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> +		if (gpuid < 0)
> +			continue;
> +		if (gpuid > last)
> +			last = gpuid;
> +		if (gpuid < first)
> +			first = gpuid;
> +	}
> +
> +	/* Either no "GPUx" slots found or they are not consecutive, abort */
> +	if (!last || last + 1 - first > max_gpus)
> +		return;
> +
> +	/* Collect GPU device nodes, sorted by an index from "GPUn" */
> +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> +		g[gpuid - first] = npu2_phb->devices[i].nvlink.pd->dn;
> +	}
> +
> +	/*
> +	 * Store interconnect phandles in the device tree.
> +	 * The mapping is from Witherspoon_Design_Workbook_v1.7_19June2018.pdf,
> +	 * pages 39 (Sequoia), 40 (Redbud):
> +	 *   Figure 16: NVLink wiring diagram for planar with 6 GPUs
> +	 *   Figure 17: NVLink wiring diagram for planar with 4 GPUs
> +	 */
> +	switch (last + 1 - first) {
> +	case 2: /* Redbud */
> +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> +				      g[1]->phandle, npuph,
> +				      g[1]->phandle, npuph,
> +				      g[1]->phandle, npuph);
> +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> +				      g[0]->phandle, npuph,
> +				      g[0]->phandle, npuph,
> +				      g[0]->phandle, npuph);
> +		break;
> +	case 3: /* Sequoia */
> +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> +				      g[1]->phandle, npuph,
> +				      g[2]->phandle, g[2]->phandle,
> +				      g[1]->phandle, npuph);
> +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> +				      g[0]->phandle, npuph,
> +				      g[2]->phandle, g[2]->phandle,
> +				      g[0]->phandle, npuph);
> +		dt_add_property_cells(g[2], "ibm,nvlink-peers",
> +				      g[1]->phandle, g[0]->phandle,
> +				      g[1]->phandle, npuph,
> +				      g[0]->phandle, npuph);
> +		break;
> +	default:
> +		prlog(PR_NOTICE, "Failed to detect the exact platform\n");
> +		break;
> +	}
> +}
> +
>  static void npu2_phb_final_fixup(struct phb *phb)
>  {
>  	int links_per_gpu = 0;
> @@ -746,6 +831,8 @@ static void npu2_phb_final_fixup(struct phb *phb)
>  	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
>  	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
>  		npu2_phb_fixup_scominit(np, links_per_gpu);
> +
> +	npu2_phb_nvlink_dt(phb, links_per_gpu);
>  }
>  
>  static void npu2_init_ioda_cache(struct npu2 *p)
>
Alistair Popple Nov. 20, 2018, 12:09 a.m. | #2
On Monday, 19 November 2018 5:20:21 PM AEDT Alexey Kardashevskiy wrote:
> Ping?
> 
> In general, on systems without a magic vpd or even with it, it is not
> very clear that the highest "ibm,npu-group-id" is the way to go. Can
> anyone think of a better way to tell what system we got?

Agreed, although it seems we already use that method in platforms/astbmc/
witherspoon.c to set witherspoon_type so you should use the existing method 
rather than reimplementing it in hw/npu2.c.

Unfortunately firmware can't detect link topology, only the device driver can 
making that the logical and ideal place to do this. The only other option 
would be to have it programmed in VPD at manufacturing and read out via HDAT 
but that seems like it would be too hard leaving this as the best solution.

- Alistair

> On 09/11/2018 16:43, Alexey Kardashevskiy wrote:
> > GPUs on Redbud and Sequoia platforms are interconnected between each
> > other in groups of 2 or 3 GPUs. The problem with that is if we decide
> > to pass one of GPUs in a group to the userspace (and potentially a guest),
> > we need to make sure that interconnectd link does not get enabled.
> > 
> > The GPU firmware provides a way to disable links on a GPU. However we
> > want to disable only links to other GPUs which are not in the same guest
> > so we need a map of what nvlink is connected to what.
> > 
> > This adds an "ibm,nvlink-peers" property to every GPU in a "GPUn" slot
> > with phandles to peer GPUs and NPU PHB, the index in the property is GPU's
> > link number.
> > 
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> > Changes:
> > v2:
> > * s/ibm,nvlinks/ibm,nvlink-peers/
> > ---
> > 
> >  hw/npu2.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 87 insertions(+)
> > 
> > diff --git a/hw/npu2.c b/hw/npu2.c
> > index d7d94357..ba1264be 100644
> > --- a/hw/npu2.c
> > +++ b/hw/npu2.c
> > @@ -732,6 +732,91 @@ static void npu2_phb_fixup_scominit(struct dt_node
> > *dn, int links_per_gpu)> 
> >  	xscom_write_mask(gcid, 0x50114c0, val, mask);
> >  
> >  }
> > 
> > +static int gpu_slot_to_num(const char *slot)
> > +{
> > +	char *p = NULL;
> > +	int ret;
> > +
> > +	if (!slot)
> > +		return -1;
> > +
> > +	if (memcmp(slot, "GPU", 3))
> > +		return -1;
> > +
> > +	ret = strtol(slot + 3, &p, 10);
> > +	if (*p || p == slot)
> > +		return -1;
> > +
> > +	return ret;
> > +}
> > +
> > +static void npu2_phb_nvlink_dt(struct phb *npuphb, int links_per_gpu)
> > +{
> > +	struct dt_node *g[3] = { 0 }; /* Current maximum is 3 GPUs per 1 NPU 
*/
> > +	const int max_gpus = 6 / links_per_gpu;
> > +	struct npu2 *npu2_phb = phb_to_npu2_nvlink(npuphb);
> > +	const u32 npuph = npuphb->dt_node->phandle;
> > +	int i, gpuid, first = max_gpus, last = 0;
> > +
> > +	/* Find the indexes of GPUs connected to this NPU */
> > +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> > +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> > +		if (gpuid < 0)
> > +			continue;
> > +		if (gpuid > last)
> > +			last = gpuid;
> > +		if (gpuid < first)
> > +			first = gpuid;
> > +	}
> > +
> > +	/* Either no "GPUx" slots found or they are not consecutive, abort */
> > +	if (!last || last + 1 - first > max_gpus)
> > +		return;
> > +
> > +	/* Collect GPU device nodes, sorted by an index from "GPUn" */
> > +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> > +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> > +		g[gpuid - first] = npu2_phb->devices[i].nvlink.pd->dn;
> > +	}
> > +
> > +	/*
> > +	 * Store interconnect phandles in the device tree.
> > +	 * The mapping is from 
Witherspoon_Design_Workbook_v1.7_19June2018.pdf,
> > +	 * pages 39 (Sequoia), 40 (Redbud):
> > +	 *   Figure 16: NVLink wiring diagram for planar with 6 GPUs
> > +	 *   Figure 17: NVLink wiring diagram for planar with 4 GPUs
> > +	 */
> > +	switch (last + 1 - first) {
> > +	case 2: /* Redbud */
> > +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> > +				      g[1]->phandle, npuph,
> > +				      g[1]->phandle, npuph,
> > +				      g[1]->phandle, npuph);
> > +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> > +				      g[0]->phandle, npuph,
> > +				      g[0]->phandle, npuph,
> > +				      g[0]->phandle, npuph);
> > +		break;
> > +	case 3: /* Sequoia */
> > +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> > +				      g[1]->phandle, npuph,
> > +				      g[2]->phandle, g[2]->phandle,
> > +				      g[1]->phandle, npuph);
> > +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> > +				      g[0]->phandle, npuph,
> > +				      g[2]->phandle, g[2]->phandle,
> > +				      g[0]->phandle, npuph);
> > +		dt_add_property_cells(g[2], "ibm,nvlink-peers",
> > +				      g[1]->phandle, g[0]->phandle,
> > +				      g[1]->phandle, npuph,
> > +				      g[0]->phandle, npuph);
> > +		break;
> > +	default:
> > +		prlog(PR_NOTICE, "Failed to detect the exact platform\n");
> > +		break;
> > +	}
> > +}
> > +
> > 
> >  static void npu2_phb_final_fixup(struct phb *phb)
> >  {
> >  
> >  	int links_per_gpu = 0;
> > 
> > @@ -746,6 +831,8 @@ static void npu2_phb_final_fixup(struct phb *phb)
> > 
> >  	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
> >  	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
> >  	
> >  		npu2_phb_fixup_scominit(np, links_per_gpu);
> > 
> > +
> > +	npu2_phb_nvlink_dt(phb, links_per_gpu);
> > 
> >  }
> >  
> >  static void npu2_init_ioda_cache(struct npu2 *p)
Stewart Smith Nov. 20, 2018, 5:29 a.m. | #3
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> GPUs on Redbud and Sequoia platforms are interconnected between each
> other in groups of 2 or 3 GPUs. The problem with that is if we decide
> to pass one of GPUs in a group to the userspace (and potentially a guest),
> we need to make sure that interconnectd link does not get enabled.
>
> The GPU firmware provides a way to disable links on a GPU. However we
> want to disable only links to other GPUs which are not in the same guest
> so we need a map of what nvlink is connected to what.
>
> This adds an "ibm,nvlink-peers" property to every GPU in a "GPUn" slot
> with phandles to peer GPUs and NPU PHB, the index in the property is GPU's
> link number.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v2:
> * s/ibm,nvlinks/ibm,nvlink-peers/
> ---
>  hw/npu2.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 87 insertions(+)

You'll also need to add something to doc/device-tree/nvlink (and maybe
also doc/nvlink?) documenting the new bindings.

> diff --git a/hw/npu2.c b/hw/npu2.c
> index d7d94357..ba1264be 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c

I wonder if this shouldn't instead live in
platforms/astbmc/witherspoon.c and be more of a platfom property than
coding here.

> @@ -732,6 +732,91 @@ static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
>  	xscom_write_mask(gcid, 0x50114c0, val, mask);
>  }
>  
> +static int gpu_slot_to_num(const char *slot)
> +{
> +	char *p = NULL;
> +	int ret;
> +
> +	if (!slot)
> +		return -1;
> +
> +	if (memcmp(slot, "GPU", 3))
> +		return -1;
> +
> +	ret = strtol(slot + 3, &p, 10);
> +	if (*p || p == slot)
> +		return -1;
> +
> +	return ret;
> +}

I am left asking if there's a better way.... but maybe there isn't.

> +
> +static void npu2_phb_nvlink_dt(struct phb *npuphb, int links_per_gpu)
> +{
> +	struct dt_node *g[3] = { 0 }; /* Current maximum is 3 GPUs per 1 NPU */
> +	const int max_gpus = 6 / links_per_gpu;
> +	struct npu2 *npu2_phb = phb_to_npu2_nvlink(npuphb);
> +	const u32 npuph = npuphb->dt_node->phandle;
> +	int i, gpuid, first = max_gpus, last = 0;
> +
> +	/* Find the indexes of GPUs connected to this NPU */
> +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> +		if (gpuid < 0)
> +			continue;
> +		if (gpuid > last)
> +			last = gpuid;
> +		if (gpuid < first)
> +			first = gpuid;
> +	}
> +
> +	/* Either no "GPUx" slots found or they are not consecutive, abort */
> +	if (!last || last + 1 - first > max_gpus)
> +		return;
> +
> +	/* Collect GPU device nodes, sorted by an index from "GPUn" */
> +	for (i = 0; i < npu2_phb->total_devices; ++i) {
> +		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
> +		g[gpuid - first] = npu2_phb->devices[i].nvlink.pd->dn;
> +	}
> +
> +	/*
> +	 * Store interconnect phandles in the device tree.
> +	 * The mapping is from Witherspoon_Design_Workbook_v1.7_19June2018.pdf,
> +	 * pages 39 (Sequoia), 40 (Redbud):
> +	 *   Figure 16: NVLink wiring diagram for planar with 6 GPUs
> +	 *   Figure 17: NVLink wiring diagram for planar with 4 GPUs
> +	 */
> +	switch (last + 1 - first) {
> +	case 2: /* Redbud */
> +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> +				      g[1]->phandle, npuph,
> +				      g[1]->phandle, npuph,
> +				      g[1]->phandle, npuph);
> +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> +				      g[0]->phandle, npuph,
> +				      g[0]->phandle, npuph,
> +				      g[0]->phandle, npuph);
> +		break;
> +	case 3: /* Sequoia */
> +		dt_add_property_cells(g[0], "ibm,nvlink-peers",
> +				      g[1]->phandle, npuph,
> +				      g[2]->phandle, g[2]->phandle,
> +				      g[1]->phandle, npuph);
> +		dt_add_property_cells(g[1], "ibm,nvlink-peers",
> +				      g[0]->phandle, npuph,
> +				      g[2]->phandle, g[2]->phandle,
> +				      g[0]->phandle, npuph);
> +		dt_add_property_cells(g[2], "ibm,nvlink-peers",
> +				      g[1]->phandle, g[0]->phandle,
> +				      g[1]->phandle, npuph,
> +				      g[0]->phandle, npuph);
> +		break;
> +	default:
> +		prlog(PR_NOTICE, "Failed to detect the exact
> platform\n");

I'd suggest PR_ERROR as no doubt somebody is going to hit this when
making a witherspoon like platform.

> +		break;
> +	}
> +}

Actually, I tihnk I am convinced that this should live in the
witherspoon platform support.

>  static void npu2_phb_final_fixup(struct phb *phb)
>  {
>  	int links_per_gpu = 0;
> @@ -746,6 +831,8 @@ static void npu2_phb_final_fixup(struct phb *phb)
>  	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
>  	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
>  		npu2_phb_fixup_scominit(np, links_per_gpu);
> +
> +	npu2_phb_nvlink_dt(phb, links_per_gpu);
>  }

Would it also be possible to add something to op-test that would check
for the presence of the topology?

Patch

diff --git a/hw/npu2.c b/hw/npu2.c
index d7d94357..ba1264be 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -732,6 +732,91 @@  static void npu2_phb_fixup_scominit(struct dt_node *dn, int links_per_gpu)
 	xscom_write_mask(gcid, 0x50114c0, val, mask);
 }
 
+static int gpu_slot_to_num(const char *slot)
+{
+	char *p = NULL;
+	int ret;
+
+	if (!slot)
+		return -1;
+
+	if (memcmp(slot, "GPU", 3))
+		return -1;
+
+	ret = strtol(slot + 3, &p, 10);
+	if (*p || p == slot)
+		return -1;
+
+	return ret;
+}
+
+static void npu2_phb_nvlink_dt(struct phb *npuphb, int links_per_gpu)
+{
+	struct dt_node *g[3] = { 0 }; /* Current maximum is 3 GPUs per 1 NPU */
+	const int max_gpus = 6 / links_per_gpu;
+	struct npu2 *npu2_phb = phb_to_npu2_nvlink(npuphb);
+	const u32 npuph = npuphb->dt_node->phandle;
+	int i, gpuid, first = max_gpus, last = 0;
+
+	/* Find the indexes of GPUs connected to this NPU */
+	for (i = 0; i < npu2_phb->total_devices; ++i) {
+		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
+		if (gpuid < 0)
+			continue;
+		if (gpuid > last)
+			last = gpuid;
+		if (gpuid < first)
+			first = gpuid;
+	}
+
+	/* Either no "GPUx" slots found or they are not consecutive, abort */
+	if (!last || last + 1 - first > max_gpus)
+		return;
+
+	/* Collect GPU device nodes, sorted by an index from "GPUn" */
+	for (i = 0; i < npu2_phb->total_devices; ++i) {
+		gpuid = gpu_slot_to_num(npu2_phb->devices[i].nvlink.slot_label);
+		g[gpuid - first] = npu2_phb->devices[i].nvlink.pd->dn;
+	}
+
+	/*
+	 * Store interconnect phandles in the device tree.
+	 * The mapping is from Witherspoon_Design_Workbook_v1.7_19June2018.pdf,
+	 * pages 39 (Sequoia), 40 (Redbud):
+	 *   Figure 16: NVLink wiring diagram for planar with 6 GPUs
+	 *   Figure 17: NVLink wiring diagram for planar with 4 GPUs
+	 */
+	switch (last + 1 - first) {
+	case 2: /* Redbud */
+		dt_add_property_cells(g[0], "ibm,nvlink-peers",
+				      g[1]->phandle, npuph,
+				      g[1]->phandle, npuph,
+				      g[1]->phandle, npuph);
+		dt_add_property_cells(g[1], "ibm,nvlink-peers",
+				      g[0]->phandle, npuph,
+				      g[0]->phandle, npuph,
+				      g[0]->phandle, npuph);
+		break;
+	case 3: /* Sequoia */
+		dt_add_property_cells(g[0], "ibm,nvlink-peers",
+				      g[1]->phandle, npuph,
+				      g[2]->phandle, g[2]->phandle,
+				      g[1]->phandle, npuph);
+		dt_add_property_cells(g[1], "ibm,nvlink-peers",
+				      g[0]->phandle, npuph,
+				      g[2]->phandle, g[2]->phandle,
+				      g[0]->phandle, npuph);
+		dt_add_property_cells(g[2], "ibm,nvlink-peers",
+				      g[1]->phandle, g[0]->phandle,
+				      g[1]->phandle, npuph,
+				      g[0]->phandle, npuph);
+		break;
+	default:
+		prlog(PR_NOTICE, "Failed to detect the exact platform\n");
+		break;
+	}
+}
+
 static void npu2_phb_final_fixup(struct phb *phb)
 {
 	int links_per_gpu = 0;
@@ -746,6 +831,8 @@  static void npu2_phb_final_fixup(struct phb *phb)
 	pci_walk_dev(phb, NULL, npu2_links_per_gpu, &links_per_gpu);
 	dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
 		npu2_phb_fixup_scominit(np, links_per_gpu);
+
+	npu2_phb_nvlink_dt(phb, links_per_gpu);
 }
 
 static void npu2_init_ioda_cache(struct npu2 *p)