Message ID | 98baba04c43bce19f397e9d720ed39a0b10c4aa2.1544597914.git-series.andrew.donnellan@au1.ibm.com |
---|---|
State | Superseded |
Headers | show |
Series | Support OpenCAPI and NVLink devices on same NPU on Witherspoon | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | master/apply_patch Successfully applied |
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot | success | Test snowpatch/job/snowpatch-skiboot on branch master |
Le 12/12/2018 à 07:58, Andrew Donnellan a écrit : > Rework NPU presence detection in preparation for supporting both NVLink and > OpenCAPI devices operating simultaneously on the same NPU. > > If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is connected > to GPU#1, the GPU will only receive 2 links rather than the usual 3. > > The reason for this is that without the OpenCAPI card, the GPU would be > use links 3-5, connected to NPU bricks 3-5, which needs both stacks 1 and 2 > to be in NVLink mode. > > However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, we > need to use NPU bricks 2-3, which means stack 1 must be set in OpenCAPI > mode. As such, the GPU will be restricted to using links 4 and 5. > > Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> > --- Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com> I can't wait to have that mess cleaned up on axone! > platforms/astbmc/witherspoon.c | 62 ++++++++++++++++++++++++++--------- > 1 file changed, 47 insertions(+), 15 deletions(-) > > diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c > index fe138991696f..c41f0c5b1971 100644 > --- a/platforms/astbmc/witherspoon.c > +++ b/platforms/astbmc/witherspoon.c > @@ -233,6 +233,8 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) > int rc; > > bool gpu0_present, gpu1_present; > + enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN; > + enum npu2_dev_type gpu1_type = NPU2_DEV_TYPE_UNKNOWN; > > if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) { > prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink, OpenCAPI only supported on Redbud\n"); > @@ -298,19 +300,11 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) > if (state & (1 << 0)) { > prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is OpenCAPI\n", > chip->id); > - /* > - * On witherspoon, bricks 2 and 3 are connected to > - * the lanes matching links 0 and 1 in OpenCAPI mode. 
> - */ > - set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI); > - /* We current don't support using the second link */ > - set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN); > + gpu0_type = NPU2_DEV_TYPE_OPENCAPI; > } else { > prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n", > chip->id); > - set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK); > - set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK); > - set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK); > + gpu0_type = NPU2_DEV_TYPE_NVLINK; > } > } > > @@ -318,16 +312,54 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) > if (state & (1 << 1)) { > prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is OpenCAPI\n", > chip->id); > - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI); > - /* We current don't support using the second link */ > - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN); > + gpu1_type = NPU2_DEV_TYPE_OPENCAPI; > } else { > prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n", > chip->id); > + gpu1_type = NPU2_DEV_TYPE_NVLINK; > + } > + } > + > + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) { > + set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI); > + /* We currently don't support using the second link */ > + set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN); > + } > + > + if (gpu0_type == NPU2_DEV_TYPE_NVLINK) { > + set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK); > + set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK); > + set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK); > + } > + > + if (gpu1_type == NPU2_DEV_TYPE_OPENCAPI) { > + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI); > + /* We currently don't support using the second link */ > + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN); > + } > + > + /* > + * If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is > + * connected to GPU#1, the GPU will only receive 2 links rather than the > + * usual 3. 
> + * > + * The reason for this is that without the OpenCAPI card, the GPU would > + * be use links 3-5, connected to NPU bricks 3-5, which needs both > + * stacks 1 and 2 to be in NVLink mode. > + * > + * However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, > + * we need to use NPU bricks 2-3, which means stack 1 must be set in > + * OpenCAPI mode. As such, the GPU will be restricted to using links 4 > + * and 5. > + */ > + if (gpu1_type == NPU2_DEV_TYPE_NVLINK) { > + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) { > + prlog(PR_WARNING, "PLAT: Chip %d GPU#1 will operate at reduced performance due to presence of OpenCAPI device. For optimal performance, swap device locations\n", chip->id); > + } else { > set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK); > - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK); > - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK); > } > + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK); > + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK); > } > > return; >
diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c index fe138991696f..c41f0c5b1971 100644 --- a/platforms/astbmc/witherspoon.c +++ b/platforms/astbmc/witherspoon.c @@ -233,6 +233,8 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) int rc; bool gpu0_present, gpu1_present; + enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN; + enum npu2_dev_type gpu1_type = NPU2_DEV_TYPE_UNKNOWN; if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) { prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink, OpenCAPI only supported on Redbud\n"); @@ -298,19 +300,11 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) if (state & (1 << 0)) { prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is OpenCAPI\n", chip->id); - /* - * On witherspoon, bricks 2 and 3 are connected to - * the lanes matching links 0 and 1 in OpenCAPI mode. - */ - set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI); - /* We current don't support using the second link */ - set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN); + gpu0_type = NPU2_DEV_TYPE_OPENCAPI; } else { prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n", chip->id); - set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK); - set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK); - set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK); + gpu0_type = NPU2_DEV_TYPE_NVLINK; } } @@ -318,16 +312,54 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu) if (state & (1 << 1)) { prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is OpenCAPI\n", chip->id); - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI); - /* We current don't support using the second link */ - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN); + gpu1_type = NPU2_DEV_TYPE_OPENCAPI; } else { prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n", chip->id); + gpu1_type = NPU2_DEV_TYPE_NVLINK; + } + } + + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) { + set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI); + /* We currently don't support using the second link */ + 
set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN); + } + + if (gpu0_type == NPU2_DEV_TYPE_NVLINK) { + set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK); + set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK); + set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK); + } + + if (gpu1_type == NPU2_DEV_TYPE_OPENCAPI) { + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI); + /* We currently don't support using the second link */ + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN); + } + + /* + * If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is + * connected to GPU#1, the GPU will only receive 2 links rather than the + * usual 3. + * + * The reason for this is that without the OpenCAPI card, the GPU would + * be use links 3-5, connected to NPU bricks 3-5, which needs both + * stacks 1 and 2 to be in NVLink mode. + * + * However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, + * we need to use NPU bricks 2-3, which means stack 1 must be set in + * OpenCAPI mode. As such, the GPU will be restricted to using links 4 + * and 5. + */ + if (gpu1_type == NPU2_DEV_TYPE_NVLINK) { + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) { + prlog(PR_WARNING, "PLAT: Chip %d GPU#1 will operate at reduced performance due to presence of OpenCAPI device. For optimal performance, swap device locations\n", chip->id); + } else { set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK); - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK); - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK); } + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK); + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK); } return;
Rework NPU presence detection in preparation for supporting both NVLink and OpenCAPI devices operating simultaneously on the same NPU. If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is connected to GPU#1, the GPU will only receive 2 links rather than the usual 3. The reason for this is that without the OpenCAPI card, the GPU would use links 3-5, connected to NPU bricks 3-5, which needs both stacks 1 and 2 to be in NVLink mode. However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, we need to use NPU bricks 2-3, which means stack 1 must be set in OpenCAPI mode. As such, the GPU will be restricted to using links 4 and 5. Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> --- platforms/astbmc/witherspoon.c | 62 ++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 15 deletions(-)