diff mbox series

[08/13] platforms/astbmc/witherspoon: Rework NPU presence detection

Message ID 98baba04c43bce19f397e9d720ed39a0b10c4aa2.1544597914.git-series.andrew.donnellan@au1.ibm.com
State Superseded
Headers show
Series Support OpenCAPI and NVLink devices on same NPU on Witherspoon | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master

Commit Message

Andrew Donnellan Dec. 12, 2018, 6:58 a.m. UTC
Rework NPU presence detection in preparation for supporting both NVLink and
OpenCAPI devices operating simultaneously on the same NPU.

If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is connected
to GPU#1, the GPU will only receive 2 links rather than the usual 3.

The reason for this is that without the OpenCAPI card, the GPU would
use links 3-5, connected to NPU bricks 3-5, which needs both stacks 1 and 2
to be in NVLink mode.

However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, we
need to use NPU bricks 2-3, which means stack 1 must be set in OpenCAPI
mode. As such, the GPU will be restricted to using links 4 and 5.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
---
 platforms/astbmc/witherspoon.c | 62 ++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 15 deletions(-)

Comments

Frederic Barrat Jan. 8, 2019, 11:51 a.m. UTC | #1
Le 12/12/2018 à 07:58, Andrew Donnellan a écrit :
> Rework NPU presence detection in preparation for supporting both NVLink and
> OpenCAPI devices operating simultaneously on the same NPU.
> 
> If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is connected
> to GPU#1, the GPU will only receive 2 links rather than the usual 3.
> 
> The reason for this is that without the OpenCAPI card, the GPU would
> use links 3-5, connected to NPU bricks 3-5, which needs both stacks 1 and 2
> to be in NVLink mode.
> 
> However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, we
> need to use NPU bricks 2-3, which means stack 1 must be set in OpenCAPI
> mode. As such, the GPU will be restricted to using links 4 and 5.
> 
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
> ---

Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>

I can't wait to have that mess cleaned up on axone!


>   platforms/astbmc/witherspoon.c | 62 ++++++++++++++++++++++++++---------
>   1 file changed, 47 insertions(+), 15 deletions(-)
> 
> diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c
> index fe138991696f..c41f0c5b1971 100644
> --- a/platforms/astbmc/witherspoon.c
> +++ b/platforms/astbmc/witherspoon.c
> @@ -233,6 +233,8 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
>   	int rc;
> 
>   	bool gpu0_present, gpu1_present;
> +	enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN;
> +	enum npu2_dev_type gpu1_type = NPU2_DEV_TYPE_UNKNOWN;
> 
>   	if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) {
>   		prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink, OpenCAPI only supported on Redbud\n");
> @@ -298,19 +300,11 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
>   		if (state & (1 << 0)) {
>   			prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is OpenCAPI\n",
>   			      chip->id);
> -			/*
> -			 * On witherspoon, bricks 2 and 3 are connected to
> -			 * the lanes matching links 0 and 1 in OpenCAPI mode.
> -			 */
> -			set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
> -			/* We current don't support using the second link */
> -			set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
> +			gpu0_type = NPU2_DEV_TYPE_OPENCAPI;
>   		} else {
>   			prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n",
>   			      chip->id);
> -			set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
> -			set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
> -			set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
> +			gpu0_type = NPU2_DEV_TYPE_NVLINK;
>   		}
>   	}
> 
> @@ -318,16 +312,54 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
>   		if (state & (1 << 1)) {
>   			prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is OpenCAPI\n",
>   			      chip->id);
> -			set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
> -			/* We current don't support using the second link */
> -			set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
> +			gpu1_type = NPU2_DEV_TYPE_OPENCAPI;
>   		} else {
>   			prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n",
>   			      chip->id);
> +			gpu1_type = NPU2_DEV_TYPE_NVLINK;
> +		}
> +	}
> +
> +	if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
> +		set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
> +		/* We currently don't support using the second link */
> +		set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
> +	}
> +
> +	if (gpu0_type == NPU2_DEV_TYPE_NVLINK) {
> +		set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
> +		set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
> +		set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
> +	}
> +
> +	if (gpu1_type == NPU2_DEV_TYPE_OPENCAPI) {
> +		set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
> +		/* We currently don't support using the second link */
> +		set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
> +	}
> +
> +	/*
> +	 * If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is
> +	 * connected to GPU#1, the GPU will only receive 2 links rather than the
> +	 * usual 3.
> +	 *
> +	 * The reason for this is that without the OpenCAPI card, the GPU would
> +	 * use links 3-5, connected to NPU bricks 3-5, which needs both
> +	 * stacks 1 and 2 to be in NVLink mode.
> +	 *
> +	 * However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1,
> +	 * we need to use NPU bricks 2-3, which means stack 1 must be set in
> +	 * OpenCAPI mode. As such, the GPU will be restricted to using links 4
> +	 * and 5.
> +	 */
> +	if (gpu1_type == NPU2_DEV_TYPE_NVLINK) {
> +		if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
> +			prlog(PR_WARNING, "PLAT: Chip %d GPU#1 will operate at reduced performance due to presence of OpenCAPI device. For optimal performance, swap device locations\n", chip->id);
> +		} else {
>   			set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK);
> -			set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
> -			set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
>   		}
> +		set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
> +		set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
>   	}
> 
>   	return;
>
diff mbox series

Patch

diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c
index fe138991696f..c41f0c5b1971 100644
--- a/platforms/astbmc/witherspoon.c
+++ b/platforms/astbmc/witherspoon.c
@@ -233,6 +233,8 @@  static void witherspoon_npu2_device_detect(struct npu2 *npu)
 	int rc;
 
 	bool gpu0_present, gpu1_present;
+	enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN;
+	enum npu2_dev_type gpu1_type = NPU2_DEV_TYPE_UNKNOWN;
 
 	if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) {
 		prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink, OpenCAPI only supported on Redbud\n");
@@ -298,19 +300,11 @@  static void witherspoon_npu2_device_detect(struct npu2 *npu)
 		if (state & (1 << 0)) {
 			prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is OpenCAPI\n",
 			      chip->id);
-			/*
-			 * On witherspoon, bricks 2 and 3 are connected to
-			 * the lanes matching links 0 and 1 in OpenCAPI mode.
-			 */
-			set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
-			/* We current don't support using the second link */
-			set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
+			gpu0_type = NPU2_DEV_TYPE_OPENCAPI;
 		} else {
 			prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n",
 			      chip->id);
-			set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
-			set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
-			set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
+			gpu0_type = NPU2_DEV_TYPE_NVLINK;
 		}
 	}
 
@@ -318,16 +312,54 @@  static void witherspoon_npu2_device_detect(struct npu2 *npu)
 		if (state & (1 << 1)) {
 			prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is OpenCAPI\n",
 			      chip->id);
-			set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
-			/* We current don't support using the second link */
-			set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
+			gpu1_type = NPU2_DEV_TYPE_OPENCAPI;
 		} else {
 			prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n",
 			      chip->id);
+			gpu1_type = NPU2_DEV_TYPE_NVLINK;
+		}
+	}
+
+	if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
+		set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
+		/* We currently don't support using the second link */
+		set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
+	}
+
+	if (gpu0_type == NPU2_DEV_TYPE_NVLINK) {
+		set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
+		set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
+		set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
+	}
+
+	if (gpu1_type == NPU2_DEV_TYPE_OPENCAPI) {
+		set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
+		/* We currently don't support using the second link */
+		set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
+	}
+
+	/*
+	 * If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is
+	 * connected to GPU#1, the GPU will only receive 2 links rather than the
+	 * usual 3.
+	 *
+	 * The reason for this is that without the OpenCAPI card, the GPU would
+	 * use links 3-5, connected to NPU bricks 3-5, which needs both
+	 * stacks 1 and 2 to be in NVLink mode.
+	 *
+	 * However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1,
+	 * we need to use NPU bricks 2-3, which means stack 1 must be set in
+	 * OpenCAPI mode. As such, the GPU will be restricted to using links 4
+	 * and 5.
+	 */
+	if (gpu1_type == NPU2_DEV_TYPE_NVLINK) {
+		if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
+			prlog(PR_WARNING, "PLAT: Chip %d GPU#1 will operate at reduced performance due to presence of OpenCAPI device. For optimal performance, swap device locations\n", chip->id);
+		} else {
 			set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK);
-			set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
-			set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
 		}
+		set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
+		set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
 	}
 
 	return;