diff mbox series

capi: Poll Err/Status register during CAPP recovery

Message ID 20180122060812.391-1-vaibhav@linux.vnet.ibm.com
State Changes Requested
Headers show
Series capi: Poll Err/Status register during CAPP recovery | expand

Commit Message

Vaibhav Jain Jan. 22, 2018, 6:08 a.m. UTC
This patch updates do_capp_recovery_scoms() to poll the CAPP
Err/Status control register, check for CAPP-Recovery to
complete/fail based on indications of BITS-1,5,9 and then proceed with
the CAPP-Recovery scoms only if recovery completed successfully. This would
prevent cases where we bring-up the PCIe link while recovery sequencer
on CAPP is still busy with casting out cache lines.

In case CAPP-Recovery didn't complete successfully an error is returned
from do_capp_recovery_scoms() asking phb4_creset() to keep the phb4
fenced and mark it as broken.

The loop that implements polling of Err/Status register will also log
an error on the PHB when it continues for more than 168ms which is the
max time to failure for CAPP-Recovery.

Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
---
This patch is based on an earlier patch "capi: Perform capp recovery
sequence only when PBCQ is idle" available at
http://patchwork.ozlabs.org/patch/862771/
---
 hw/phb4.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 8 deletions(-)

Comments

Christophe Lombard Jan. 22, 2018, 10:50 a.m. UTC | #1
Le 22/01/2018 à 07:08, Vaibhav Jain a écrit :
> This patch updates do_capp_recovery_scoms() to poll the CAPP
> Err/Status control register, check for CAPP-Recovery to
> complete/fail based on indications of BITS-1,5,9 and then proceed with
> the CAPP-Recovery scoms iif recovery completed successfully. This would
> prevent cases where we bring-up the PCIe link while recovery sequencer
> on CAPP is still busy with casting out cache lines.
> 
> In case CAPP-Recovery didn't complete successfully an error is returned
> from do_capp_recovery_scoms() asking phb4_creset() to keep the phb4
> fenced and mark it as broken.
> 
> The loop that implements polling of Err/Status register will also log
> an error on the PHB when it continues for more than 168ms which is the
> max time to failure for CAPP-Recovery.
> 
> Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
> ---
> This patch is based on an earlier patch "capi: Perform capp recovery
> sequence only when PBCQ is idle" available at
> http://patchwork.ozlabs.org/patch/862771/
> ---
>   hw/phb4.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 54 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 93ed1e06..23f923d7 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -2829,25 +2829,70 @@ static int64_t load_capp_ucode(struct phb4 *p)
>   	return rc;
>   }
>   
> -static void do_capp_recovery_scoms(struct phb4 *p)
> +static int64_t do_capp_recovery_scoms(struct phb4 *p)
>   {
> -	uint64_t reg;
> -	uint32_t offset;
> +	uint64_t rc, reg, end;
> +	uint64_t offset = PHB4_CAPP_REG_OFFSET(p);
>   
> -	PHBDBG(p, "Doing CAPP recovery scoms\n");
> +	PHBDBG(p, "CAPP: Waiting for recovery to complete\n");
>   
> -	offset = PHB4_CAPP_REG_OFFSET(p);
> +	/* recovery timer failure period 168ms */
> +	end = mftb() + msecs_to_tb(168);
> +	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
> +	while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) {
> +

If the recovery failed, we must get out of the loop.

> +		time_wait_ms(5);
> +		xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
> +
> +		if (end && tb_compare(mftb(), end) != TB_AAFTERB) {
> +			PHBERR(p, "CAPP: Capp recovery Timed-out.\n");
> +			end = 0;
> +		}
> +	}
> +
> +	/* Check if the recovery failed or passed */
> +	if (reg & PPC_BIT(1)) {

You should also test bit(0) "If Error Recovery Initiated is 1 and this
bit is 1, the CAPP Recovery Process has completed"

> +		PHBDBG(p, "Doing CAPP recovery scoms\n");
>   		/* disable snoops */
>   		xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
> -	load_capp_ucode(p);
> +		rc = load_capp_ucode(p);
> +
> +		if (rc) {
> +			PHBERR(p, "CAPP: Unable to reload ucode\n");
> +			goto out;
> +		}
> +

We have never stopped do_capp_recovery_scoms() when load_capp_ucode(p) has
failed (same on phb3). Maybe there is a good reason for that; it needs
to be investigated.

>   		/* clear err rpt reg*/
>   		xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
> +
>   		/* clear capp fir */
>   		xscom_write(p->chip_id, CAPP_FIR + offset, 0);
>   
> +		/* Just reset Bit-0,1 and dont touch any other bit */
>   		xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
>   		reg &= ~(PPC_BIT(0) | PPC_BIT(1));
>   		xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
> +
> +	} else {
> +
> +		/* We will checkstop here due to FIR ACTION for
> +		 * failed recovery. So this message would never be logged.
> +		 * But if we still enter here then return an error forcing a
> +		 * fence of the PHB.
> +		 */
> +		if (reg  & PPC_BIT(5))
> +			PHBERR(p, "CAPP: Capp recovery Failed\n");
> +		else if (reg  & PPC_BIT(9))
> +			PHBERR(p, "CAPP: Capp recovery hang detected\n");
> +		else
> +			PHBERR(p, "CAPP: Unknown recovery failure\n");
> +
> +		PHBDGB("CAPP: Err/Status-reg=0x%016llx\n", reg);
> +		rc = OPAL_HARDWARE;

Same remark as before: is there any reason to mark the PHB as dead?

> +	}
> +
> +out:
> +	return rc;
>   }
>   
>   static int64_t phb4_creset(struct pci_slot *slot)
> @@ -2906,8 +2951,9 @@ static int64_t phb4_creset(struct pci_slot *slot)
>   			PHBDBG(p, "CRESET: No pending transactions\n");
>   
>   			/* capp recovery */
> -			if (p->flags & PHB4_CAPP_RECOVERY)
> -				do_capp_recovery_scoms(p);
> +			if (p->flags & PHB4_CAPP_RECOVERY &&
> +			    do_capp_recovery_scoms(p))
> +				goto error;
>   
>   			/* Clear errors in PFIR and NFIR */
>   			xscom_write(p->chip_id, p->pci_stk_xscom + 0x1,
>
diff mbox series

Patch

diff --git a/hw/phb4.c b/hw/phb4.c
index 93ed1e06..23f923d7 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -2829,25 +2829,70 @@  static int64_t load_capp_ucode(struct phb4 *p)
 	return rc;
 }
 
-static void do_capp_recovery_scoms(struct phb4 *p)
+static int64_t do_capp_recovery_scoms(struct phb4 *p)
 {
-	uint64_t reg;
-	uint32_t offset;
+	int64_t rc;
+	uint64_t reg, end, offset = PHB4_CAPP_REG_OFFSET(p);
 
-	PHBDBG(p, "Doing CAPP recovery scoms\n");
+	PHBDBG(p, "CAPP: Waiting for recovery to complete\n");
 
-	offset = PHB4_CAPP_REG_OFFSET(p);
+	/* recovery timer failure period 168ms */
+	end = mftb() + msecs_to_tb(168);
+	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+	while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) {
+		/* Poll every 5ms until bit 1 (done), 5 (failed) or 9 (hang) */
+		time_wait_ms(5);
+		xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+		/* Log the timeout only once: end == 0 means already reported */
+		if (end && tb_compare(mftb(), end) == TB_AAFTERB) {
+			PHBERR(p, "CAPP: Capp recovery Timed-out.\n");
+			end = 0;
+		}
+	}
+
+	/* Check if the recovery failed or passed */
+	if (reg & PPC_BIT(1)) {
+		PHBDBG(p, "Doing CAPP recovery scoms\n");
 		/* disable snoops */
 		xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
-	load_capp_ucode(p);
+		rc = load_capp_ucode(p);
+
+		if (rc) {
+			PHBERR(p, "CAPP: Unable to reload ucode\n");
+			goto out;
+		}
+
 		/* clear err rpt reg*/
 		xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+
 		/* clear capp fir */
 		xscom_write(p->chip_id, CAPP_FIR + offset, 0);
 
+		/* Just reset Bit-0,1 and dont touch any other bit */
 		xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
 		reg &= ~(PPC_BIT(0) | PPC_BIT(1));
 		xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+
+	} else {
+
+		/* We will checkstop here due to FIR ACTION for
+		 * failed recovery. So this message would never be logged.
+		 * But if we still enter here then return an error forcing a
+		 * fence of the PHB.
+		 */
+		if (reg & PPC_BIT(5))
+			PHBERR(p, "CAPP: Capp recovery Failed\n");
+		else if (reg & PPC_BIT(9))
+			PHBERR(p, "CAPP: Capp recovery hang detected\n");
+		else
+			PHBERR(p, "CAPP: Unknown recovery failure\n");
+
+		PHBDBG(p, "CAPP: Err/Status-reg=0x%016llx\n", reg);
+		rc = OPAL_HARDWARE;
+	}
+
+out:
+	return rc;
 }
 
 static int64_t phb4_creset(struct pci_slot *slot)
@@ -2906,8 +2951,9 @@  static int64_t phb4_creset(struct pci_slot *slot)
 			PHBDBG(p, "CRESET: No pending transactions\n");
 
 			/* capp recovery */
-			if (p->flags & PHB4_CAPP_RECOVERY)
-				do_capp_recovery_scoms(p);
+			if (p->flags & PHB4_CAPP_RECOVERY &&
+			    do_capp_recovery_scoms(p))
+				goto error;
 
 			/* Clear errors in PFIR and NFIR */
 			xscom_write(p->chip_id, p->pci_stk_xscom + 0x1,