diff mbox series

[1/2] phb4: Reallocate PEC2 DMA-Read engines to improve GPU-Direct bandwidth

Message ID 20180719075500.17822-1-vaibhav@linux.ibm.com
State Accepted
Headers show
Series [1/2] phb4: Reallocate PEC2 DMA-Read engines to improve GPU-Direct bandwidth | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Vaibhav Jain July 19, 2018, 7:54 a.m. UTC
We reallocate the DMA-Read engines on PEC2 so that stack0 and stack1
receive an additional 16 and 8 engines respectively. This is needed to
improve the bandwidth available to the Mellanox CX5 adapter when it
reads GPU memory (GPU-Direct).

If the kernel cxl driver requests the maximum possible number of DMA
read engines when calling enable_capi_mode(), and the card is attached
to the PEC2/stack0 slot, then we assume it's a Mellanox CX5 adapter. We
then allocate 16/8 extra DMA read engines to stack0 and stack1
respectively on PEC2. This is done by populating the
XPEC_PCI_PRDSTKOVR and XPEC_NEST_READ_STACK_OVERRIDE registers as
suggested by the h/w team.

Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 hw/phb4.c           | 40 +++++++++++++++++++++++++++++++++++++---
 include/phb4-regs.h |  2 ++
 2 files changed, 39 insertions(+), 3 deletions(-)

Comments

Andrew Donnellan July 19, 2018, 8:04 a.m. UTC | #1
On 19/07/18 17:54, Vaibhav Jain wrote:
> We reallocate additional 16/8 DMA-Read engines allocated to stack0/1
> on PEC2 respectively. This is needed to improve bandwidth available to
> the Mellanox CX5 adapter when trying to read GPU memory (GPU-Direct).
> 
> If kernel cxl driver indicates a request to allocate maximum possible
> DMA read engines when calling enable_capi_mode() and card is attached
> to PEC2/stack0 slot then we assume its a Mellanox CX5 adapter. We then
> allocate additional 16/8 extra DMA read engines to stack0 and stack1
> respectively on PEC2. This is done by populating the
> XPEC_PCI_PRDSTKOVR and XPEC_NEST_READ_STACK_OVERRIDE as suggested by
> the h/w team.
> 
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
>   hw/phb4.c           | 40 +++++++++++++++++++++++++++++++++++++---
>   include/phb4-regs.h |  2 ++
>   2 files changed, 39 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 8ba3eb78..f2b92409 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -3806,7 +3806,7 @@ static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
>   		/* max PHB read buffers 0-47 */
>   		reg = 0xFFFFFFFFFFFF0000;
>   		if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
> -			reg = 0xFF00000000000000;
> +			reg = 0xF000000000000000;
>   		xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
>   		xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
>   	}
> @@ -3814,7 +3814,7 @@ static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
>   		/* Set 30 Read machines for CAPP Minus 20-27 for DMA */
>   		reg = 0xFFFFF00E00000000;
>   		if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
> -			reg = 0xFF00000000000000;
> +			reg = 0xF000000000000000;
>   		xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
>   		xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
>   	}
> @@ -3932,6 +3932,8 @@ static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
>   		return OPAL_HARDWARE;
>   	}
>   
> +	stq_eng = 0x0000000000000000ULL;
> +	dma_eng = 0x0000000000000000ULL;
>   	if (p->index == CAPP0_PHB_INDEX) {
>   		/* PBCQ is operating as a x16 stack
>   		 * - The maximum number of engines give to CAPP will be
> @@ -3974,10 +3976,42 @@ static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
>   	reg = 0x8000000000000000ULL; /* PEC works in CAPP Mode */
>   	reg |= stq_eng;
>   	if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
> -		dma_eng = 0x0000FF0000000000ULL; /* 16 CAPP Read machines */
> +		dma_eng = 0x0000F00000000000ULL; /* 4 CAPP Read machines */

For the record, Vaibhav explained that the old comment here was
inaccurate: the value is being changed from 8->4 rather than 16->4.

>   	reg |= dma_eng;
>   	xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, reg);
>   
> +	/* PEC2 has 3 ETU's + 16 pci lanes that can operate as x16,
> +	 * x8+x8 (bifurcated) or x8+x4+x4 (trifurcated) mode. When
> +	 * Mellanox CX5 card is attached to stack0 of this PEC, indicated by
> +	 * request to allocate CAPP_MAX_DMA_READ_ENGINES; we tweak the default
> +	 * dma-read engines allocations to maximize the DMA read performance
> +	 */

Thanks for this comment, it's much clearer than v1.

> +	if ((p->index == CAPP1_PHB_INDEX) &&
> +	    (capp_eng & CAPP_MAX_DMA_READ_ENGINES)) {
> +
> +		/*
> +		 * Allocate Additional 16/8 dma read engines to stack0/stack1
> +		 * respectively. Read engines 0:31 are anyways always assigned
> +		 * to stack0. Also skip allocating DMA Read Engine-32 by
> +		 * enabling Bit[0] in XPEC_NEST_READ_STACK_OVERRIDE register.
> +		 * Enabling this bit seems cause a parity error reported in
> +		 * NFIR[1]-nonbar_pe.
> +		 */
> +		reg = 0x7fff80007F008000ULL;
> +
> +		xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PRDSTKOVR, reg);
> +		xscom_write(p->chip_id, p->pe_xscom +
> +			    XPEC_NEST_READ_STACK_OVERRIDE, reg);
> +
> +		/* Log this reallocation as it may impact dma performance of
> +		 * other slots connected to PEC2
> +		 */
> +		PHBINF(p, "CAPP: Set %d dma-read engines for PEC2/stack-0\n",
> +		      32 + __builtin_popcountll(reg & PPC_BITMASK(0, 31)));
> +		PHBDBG(p, "CAPP: XPEC_NEST_READ_STACK_OVERRIDE: %016llx\n",
> +		       reg);
> +	}
> +
>   	/* PCI to PB data movement ignores the PB init signal. */
>   	xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
>   			 XPEC_NEST_PBCQ_HW_CONFIG_PBINIT,
> diff --git a/include/phb4-regs.h b/include/phb4-regs.h
> index d7b551f3..ef3cfa93 100644
> --- a/include/phb4-regs.h
> +++ b/include/phb4-regs.h
> @@ -347,6 +347,7 @@
>   #define   XPEC_NEST_PBCQ_HW_CONFIG_DIS_NODAL	PPC_BIT(50)
>   #define   XPEC_NEST_PBCQ_HW_CONFIG_DIS_RNNN	PPC_BIT(52)
>   #define XPEC_NEST_CAPP_CNTL			0x7
> +#define XPEC_NEST_READ_STACK_OVERRIDE		0x8
>   
>   /* Nest base per-stack registers */
>   #define XPEC_NEST_STK_PCI_NFIR			0x0
> @@ -381,6 +382,7 @@
>   /* PCI base registers */
>   #define XPEC_PCI_PBAIB_HW_CONFIG		0x0
>   #define XPEC_PCI_CAPP_SEC_BAR			0x1
> +#define XPEC_PCI_PRDSTKOVR			0x2
>   
>   /* PCI base per-stack registers */
>   #define XPEC_PCI_STK_PCI_FIR			0x0
>
Stewart Smith July 19, 2018, 9:48 a.m. UTC | #2
Vaibhav Jain <vaibhav@linux.ibm.com> writes:
> We reallocate additional 16/8 DMA-Read engines allocated to stack0/1
> on PEC2 respectively. This is needed to improve bandwidth available to
> the Mellanox CX5 adapter when trying to read GPU memory (GPU-Direct).
>
> If kernel cxl driver indicates a request to allocate maximum possible
> DMA read engines when calling enable_capi_mode() and card is attached
> to PEC2/stack0 slot then we assume its a Mellanox CX5 adapter. We then
> allocate additional 16/8 extra DMA read engines to stack0 and stack1
> respectively on PEC2. This is done by populating the
> XPEC_PCI_PRDSTKOVR and XPEC_NEST_READ_STACK_OVERRIDE as suggested by
> the h/w team.
>
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
>  hw/phb4.c           | 40 +++++++++++++++++++++++++++++++++++++---
>  include/phb4-regs.h |  2 ++
>  2 files changed, 39 insertions(+), 3 deletions(-)

thanks, and thanks for addressing things Andrew and I brought up in that
random internal Slack thread to get this ready to go in so quickly.

Merged to master as of 5690c5a8980faf9e528df65dd95535e21c2c868f
and to 6.0.x as of b50653f4c4707f3406269573bc421df3cffce950

I fixed the obvious nitpicks in the doc with merging... although I
didn't do the "nor on P8 for set_mode_pcie" as I'm not 100% sure of that
and wanted to check (I thought we did that... or maybe we just *think*
we can and reset things appropriately in fast-reboot - or I'm just plain
wrong and shouldn't be allowed near computers).
Andrew Donnellan July 20, 2018, 12:51 a.m. UTC | #3
On 19/07/18 19:48, Stewart Smith wrote:
> Vaibhav Jain <vaibhav@linux.ibm.com> writes:
>> We reallocate additional 16/8 DMA-Read engines allocated to stack0/1
>> on PEC2 respectively. This is needed to improve bandwidth available to
>> the Mellanox CX5 adapter when trying to read GPU memory (GPU-Direct).
>>
>> If kernel cxl driver indicates a request to allocate maximum possible
>> DMA read engines when calling enable_capi_mode() and card is attached
>> to PEC2/stack0 slot then we assume its a Mellanox CX5 adapter. We then
>> allocate additional 16/8 extra DMA read engines to stack0 and stack1
>> respectively on PEC2. This is done by populating the
>> XPEC_PCI_PRDSTKOVR and XPEC_NEST_READ_STACK_OVERRIDE as suggested by
>> the h/w team.
>>
>> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
>> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
>> ---
>>   hw/phb4.c           | 40 +++++++++++++++++++++++++++++++++++++---
>>   include/phb4-regs.h |  2 ++
>>   2 files changed, 39 insertions(+), 3 deletions(-)
> 
> thanks, and thanks for addressing things Andrew and I brought up in that
> random internal Slack thread to get this ready to go in so quickly.
> 
> Merged to master as of 5690c5a8980faf9e528df65dd95535e21c2c868f
> and to 6.0.x as of b50653f4c4707f3406269573bc421df3cffce950
> 
> I fixed the obvious nitpicks in the doc with merging... although I
> didn't do the "nor on P8 for set_mode_pcie" as I'm not 100% sure of that
> and wanted to check (I thought we did that... or maybe we just *think*
> we can and reset things appropriately in fast-reboot - or I'm just plain
> wrong and shouldn't be allowed near computers).
> 

Thanks.

As for pcie mode on p8, see phb3.c:3695. To switch back, we have to do a 
CRESET on the PHB instead.
diff mbox series

Patch

diff --git a/hw/phb4.c b/hw/phb4.c
index 8ba3eb78..f2b92409 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -3806,7 +3806,7 @@  static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
 		/* max PHB read buffers 0-47 */
 		reg = 0xFFFFFFFFFFFF0000;
 		if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
-			reg = 0xFF00000000000000;
+			reg = 0xF000000000000000;
 		xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
 		xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
 	}
@@ -3814,7 +3814,7 @@  static void phb4_init_capp_regs(struct phb4 *p, uint32_t capp_eng)
 		/* Set 30 Read machines for CAPP Minus 20-27 for DMA */
 		reg = 0xFFFFF00E00000000;
 		if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
-			reg = 0xFF00000000000000;
+			reg = 0xF000000000000000;
 		xscom_write(p->chip_id, APC_FSM_READ_MASK + offset, reg);
 		xscom_write(p->chip_id, XPT_FSM_RMM + offset, reg);
 	}
@@ -3932,6 +3932,8 @@  static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
 		return OPAL_HARDWARE;
 	}
 
+	stq_eng = 0x0000000000000000ULL;
+	dma_eng = 0x0000000000000000ULL;
 	if (p->index == CAPP0_PHB_INDEX) {
 		/* PBCQ is operating as a x16 stack
 		 * - The maximum number of engines give to CAPP will be
@@ -3974,10 +3976,42 @@  static int64_t enable_capi_mode(struct phb4 *p, uint64_t pe_number,
 	reg = 0x8000000000000000ULL; /* PEC works in CAPP Mode */
 	reg |= stq_eng;
 	if (capp_eng & CAPP_MAX_DMA_READ_ENGINES)
-		dma_eng = 0x0000FF0000000000ULL; /* 16 CAPP Read machines */
+		dma_eng = 0x0000F00000000000ULL; /* 4 CAPP Read machines */
 	reg |= dma_eng;
 	xscom_write(p->chip_id, p->pe_xscom + XPEC_NEST_CAPP_CNTL, reg);
 
+	/* PEC2 has 3 ETU's + 16 pci lanes that can operate as x16,
+	 * x8+x8 (bifurcated) or x8+x4+x4 (trifurcated) mode. When
+	 * Mellanox CX5 card is attached to stack0 of this PEC, indicated by
+	 * request to allocate CAPP_MAX_DMA_READ_ENGINES; we tweak the default
+	 * dma-read engines allocations to maximize the DMA read performance
+	 */
+	if ((p->index == CAPP1_PHB_INDEX) &&
+	    (capp_eng & CAPP_MAX_DMA_READ_ENGINES)) {
+
+		/*
+		 * Allocate Additional 16/8 dma read engines to stack0/stack1
+		 * respectively. Read engines 0:31 are anyways always assigned
+		 * to stack0. Also skip allocating DMA Read Engine-32 by not
+		 * enabling Bit[0] in XPEC_NEST_READ_STACK_OVERRIDE register.
+		 * Enabling this bit seems to cause a parity error reported in
+		 * NFIR[1]-nonbar_pe.
+		 */
+		reg = 0x7fff80007F008000ULL;
+
+		xscom_write(p->chip_id, p->pci_xscom + XPEC_PCI_PRDSTKOVR, reg);
+		xscom_write(p->chip_id, p->pe_xscom +
+			    XPEC_NEST_READ_STACK_OVERRIDE, reg);
+
+		/* Log this reallocation as it may impact dma performance of
+		 * other slots connected to PEC2
+		 */
+		PHBINF(p, "CAPP: Set %d dma-read engines for PEC2/stack-0\n",
+		      32 + __builtin_popcountll(reg & PPC_BITMASK(0, 31)));
+		PHBDBG(p, "CAPP: XPEC_NEST_READ_STACK_OVERRIDE: %016llx\n",
+		       reg);
+	}
+
 	/* PCI to PB data movement ignores the PB init signal. */
 	xscom_write_mask(p->chip_id, p->pe_xscom + XPEC_NEST_PBCQ_HW_CONFIG,
 			 XPEC_NEST_PBCQ_HW_CONFIG_PBINIT,
diff --git a/include/phb4-regs.h b/include/phb4-regs.h
index d7b551f3..ef3cfa93 100644
--- a/include/phb4-regs.h
+++ b/include/phb4-regs.h
@@ -347,6 +347,7 @@ 
 #define   XPEC_NEST_PBCQ_HW_CONFIG_DIS_NODAL	PPC_BIT(50)
 #define   XPEC_NEST_PBCQ_HW_CONFIG_DIS_RNNN	PPC_BIT(52)
 #define XPEC_NEST_CAPP_CNTL			0x7
+#define XPEC_NEST_READ_STACK_OVERRIDE		0x8
 
 /* Nest base per-stack registers */
 #define XPEC_NEST_STK_PCI_NFIR			0x0
@@ -381,6 +382,7 @@ 
 /* PCI base registers */
 #define XPEC_PCI_PBAIB_HW_CONFIG		0x0
 #define XPEC_PCI_CAPP_SEC_BAR			0x1
+#define XPEC_PCI_PRDSTKOVR			0x2
 
 /* PCI base per-stack registers */
 #define XPEC_PCI_STK_PCI_FIR			0x0