Message ID | 20180122060812.391-1-vaibhav@linux.vnet.ibm.com |
---|---|
State | Changes Requested |
Headers | show |
Series | capi: Poll Err/Status register during CAPP recovery | expand |
Le 22/01/2018 à 07:08, Vaibhav Jain a écrit : > This patch updates do_capp_recovery_scoms() to poll the CAPP > Err/Status control register, check for CAPP-Recovery to > complete/fail based on indications of BITS-1,5,9 and then proceed with > the CAPP-Recovery scoms iif recovery completed successfully. This would > prevent cases where we bring-up the PCIe link while recovery sequencer > on CAPP is still busy with casting out cache lines. > > In case CAPP-Recovery didn't complete successfully an error is returned > from do_capp_recovery_scoms() asking phb4_creset() to keep the phb4 > fenced and mark it as broken. > > The loop that implements polling of Err/Status register will also log > an error on the PHB when it continues for more than 168ms which is the > max time to failure for CAPP-Recovery. > > Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com> > --- > This patch is based on an earlier patch "capi: Perform capp recovery > sequence only when PBCQ is idle" available at > http://patchwork.ozlabs.org/patch/862771/ > --- > hw/phb4.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 54 insertions(+), 8 deletions(-) > > diff --git a/hw/phb4.c b/hw/phb4.c > index 93ed1e06..23f923d7 100644 > --- a/hw/phb4.c > +++ b/hw/phb4.c > @@ -2829,25 +2829,70 @@ static int64_t load_capp_ucode(struct phb4 *p) > return rc; > } > > -static void do_capp_recovery_scoms(struct phb4 *p) > +static int64_t do_capp_recovery_scoms(struct phb4 *p) > { > - uint64_t reg; > - uint32_t offset; > + uint64_t rc, reg, end; > + uint64_t offset = PHB4_CAPP_REG_OFFSET(p); > > - PHBDBG(p, "Doing CAPP recovery scoms\n"); > + PHBDBG(p, "CAPP: Waiting for recovery to complete\n"); > > - offset = PHB4_CAPP_REG_OFFSET(p); > + /* recovery timer failure period 168ms */ > + end = mftb() + msecs_to_tb(168); > + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); > + while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) { > + if the recovery failed, we must 
get out of the loop > + time_wait_ms(5); > + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); > + > + if (end && tb_compare(mftb(), end) != TB_AAFTERB) { > + PHBERR(p, "CAPP: Capp recovery Timed-out.\n"); > + end = 0; > + } > + } > + > + /* Check if the recovery failed or passed */ > + if (reg & PPC_BIT(1)) { You should also test bit(0) "If Error Recovery Initiated is 1 and this bit is 1, the CAPP Recovery Process has completed" > + PHBDBG(p, "Doing CAPP recovery scoms\n"); > /* disable snoops */ > xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0); > - load_capp_ucode(p); > + rc = load_capp_ucode(p); > + > + if (rc) { > + PHBERR(p, "CAPP: Unable to reload ucode\n"); > + goto out; > + } > + We have never stopped do_capp_recovery_scoms() if load_capp_ucode(p) has failed (same on phb3). Maybe there is a good reason. Need to investigate that. > /* clear err rpt reg*/ > xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0); > + > /* clear capp fir */ > xscom_write(p->chip_id, CAPP_FIR + offset, 0); > > + /* Just reset Bit-0,1 and dont touch any other bit */ > xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); > reg &= ~(PPC_BIT(0) | PPC_BIT(1)); > xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg); > + > + } else { > + > + /* We will checkstop here due to FIR ACTION for > + * failed recovery. So this message would never be logged. > + * But if we still enter here then return an error forcing a > + * fence of the PHB. > + */ > + if (reg & PPC_BIT(5)) > + PHBERR(p, "CAPP: Capp recovery Failed\n"); > + else if (reg & PPC_BIT(9)) > + PHBERR(p, "CAPP: Capp recovery hang detected\n"); > + else > + PHBERR(p, "CAPP: Unknown recovery failure\n"); > + > + PHBDGB("CAPP: Err/Status-reg=0x%016llx\n", reg); > + rc = OPAL_HARDWARE; Same remark as previously. Any reason to mark the PHB as dead? 
> + } > + > +out: > + return rc; > } > > static int64_t phb4_creset(struct pci_slot *slot) > @@ -2906,8 +2951,9 @@ static int64_t phb4_creset(struct pci_slot *slot) > PHBDBG(p, "CRESET: No pending transactions\n"); > > /* capp recovery */ > - if (p->flags & PHB4_CAPP_RECOVERY) > - do_capp_recovery_scoms(p); > + if (p->flags & PHB4_CAPP_RECOVERY && > + do_capp_recovery_scoms(p)) > + goto error; > > /* Clear errors in PFIR and NFIR */ > xscom_write(p->chip_id, p->pci_stk_xscom + 0x1, >
diff --git a/hw/phb4.c b/hw/phb4.c index 93ed1e06..23f923d7 100644 --- a/hw/phb4.c +++ b/hw/phb4.c @@ -2829,25 +2829,70 @@ static int64_t load_capp_ucode(struct phb4 *p) return rc; } -static void do_capp_recovery_scoms(struct phb4 *p) +static int64_t do_capp_recovery_scoms(struct phb4 *p) { - uint64_t reg; - uint32_t offset; + uint64_t rc, reg, end; + uint64_t offset = PHB4_CAPP_REG_OFFSET(p); - PHBDBG(p, "Doing CAPP recovery scoms\n"); + PHBDBG(p, "CAPP: Waiting for recovery to complete\n"); - offset = PHB4_CAPP_REG_OFFSET(p); + /* recovery timer failure period 168ms */ + end = mftb() + msecs_to_tb(168); + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); + while ((reg & (PPC_BIT(1) | PPC_BIT(5) | PPC_BIT(9))) == 0) { + + time_wait_ms(5); + xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); + + if (end && tb_compare(mftb(), end) != TB_AAFTERB) { + PHBERR(p, "CAPP: Capp recovery Timed-out.\n"); + end = 0; + } + } + + /* Check if the recovery failed or passed */ + if (reg & PPC_BIT(1)) { + PHBDBG(p, "Doing CAPP recovery scoms\n"); /* disable snoops */ xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0); - load_capp_ucode(p); + rc = load_capp_ucode(p); + + if (rc) { + PHBERR(p, "CAPP: Unable to reload ucode\n"); + goto out; + } + /* clear err rpt reg*/ xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0); + /* clear capp fir */ xscom_write(p->chip_id, CAPP_FIR + offset, 0); + /* Just reset Bit-0,1 and dont touch any other bit */ xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg); reg &= ~(PPC_BIT(0) | PPC_BIT(1)); xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg); + + } else { + + /* We will checkstop here due to FIR ACTION for + * failed recovery. So this message would never be logged. + * But if we still enter here then return an error forcing a + * fence of the PHB. 
+ */ + if (reg & PPC_BIT(5)) + PHBERR(p, "CAPP: Capp recovery Failed\n"); + else if (reg & PPC_BIT(9)) + PHBERR(p, "CAPP: Capp recovery hang detected\n"); + else + PHBERR(p, "CAPP: Unknown recovery failure\n"); + + PHBDGB("CAPP: Err/Status-reg=0x%016llx\n", reg); + rc = OPAL_HARDWARE; + } + +out: + return rc; } static int64_t phb4_creset(struct pci_slot *slot) @@ -2906,8 +2951,9 @@ static int64_t phb4_creset(struct pci_slot *slot) PHBDBG(p, "CRESET: No pending transactions\n"); /* capp recovery */ - if (p->flags & PHB4_CAPP_RECOVERY) - do_capp_recovery_scoms(p); + if (p->flags & PHB4_CAPP_RECOVERY && + do_capp_recovery_scoms(p)) + goto error; /* Clear errors in PFIR and NFIR */ xscom_write(p->chip_id, p->pci_stk_xscom + 0x1,
This patch updates do_capp_recovery_scoms() to poll the CAPP Err/Status control register, check for CAPP-Recovery to complete/fail based on indications of BITS-1,5,9 and then proceed with the CAPP-Recovery scoms iff recovery completed successfully. This would prevent cases where we bring-up the PCIe link while recovery sequencer on CAPP is still busy with casting out cache lines. In case CAPP-Recovery didn't complete successfully an error is returned from do_capp_recovery_scoms() asking phb4_creset() to keep the phb4 fenced and mark it as broken. The loop that implements polling of Err/Status register will also log an error on the PHB when it continues for more than 168ms which is the max time to failure for CAPP-Recovery. Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com> --- This patch is based on an earlier patch "capi: Perform capp recovery sequence only when PBCQ is idle" available at http://patchwork.ozlabs.org/patch/862771/ --- hw/phb4.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 8 deletions(-)