Message ID | 151266318695.20428.3075500779619115039.stgit@jupiter.in.ibm.com |
---|---|
State | Superseded |
Headers | show |
Series | [1/2] opal/xscom: Move the delay inside xscom_reset() function. | expand |
On Thu, 07 Dec 2017 21:43:06 +0530 Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote: > From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> > > Due to a hardware issue where core responding to scom was delayed due to > thread reconfiguration, leaves the SCOM logic in a state where the > subsequent scom to that core can get errors. This is affected for Core > PC scom registers in the range of 20010A80-20010ABF > > The solution is if a xscom timeout occurs to one of Core PC scom registers > in the range of 20010A80-20010ABF, a clearing scom write is done to > 0x20010800 with data of '0x00000000' which will also get a timeout but > clears the scom logic errors. After the clearing write is done the original > scom operation can be retried. > > The scom timeout is reported as status 0x4 (Invalid address) in HMER[21-23]. This looks to me like it follows the recipe for the fix. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> > > Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> > --- > hw/xscom.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- > include/xscom.h | 8 ++++++ > 2 files changed, 80 insertions(+), 3 deletions(-) > > diff --git a/hw/xscom.c b/hw/xscom.c > index 2621465..2ad5114 100644 > --- a/hw/xscom.c > +++ b/hw/xscom.c > @@ -151,8 +151,64 @@ static void xscom_reset(uint32_t gcid) > */ > } > > +static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr) > +{ > + u64 hmer; > + uint32_t base_xscom_addr; > + uint32_t xscom_clear_reg = 0x20010800; > + > + /* only in case of p9 */ > + if (proc_gen != proc_gen_p9) > + return 0; > + > + /* > + * Due to a hardware issue where core responding to scom was delayed > + * due to thread reconfiguration, leaves the scom logic in a state > + * where the subsequent scom to that core can get errors. This is > + * affected for Core PC scom registers in the range of > + * 20010A80-20010ABF. > + * > + * The solution is if a xscom timeout occurs to one of Core PC scom > + * registers in the range of 20010A80-20010ABF, a clearing scom > + * write is done to 0x20010800 with data of '0x00000000' which will > + * also get a timeout but clears the scom logic errors. After the > + * clearing write is done the original scom operation can be retried. > + * > + * The scom timeout is reported as status 0x4 (Invalid address) > + * in HMER[21-23]. > + */ > + > + base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK; > + if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) && > + (base_xscom_addr <= XSCOM_CLEAR_RANGE_END))) > + return 0; > + > + /* Reset the XSCOM or next scom operation will fail. */ > + xscom_reset(gcid); > + > + /* Clear errors in HMER */ > + mtspr(SPR_HMER, HMER_CLR_MASK); > + > + /* Write 0 to clear the xscom logic errors on target chip */ > + out_be64(xscom_addr(gcid, xscom_clear_reg), 0); > + hmer = xscom_wait_done(); > + > + /* > + * Above clearing xscom write will timeout and error out with > + * invalid access as there is no register at that address. This > + * xscom operation just helps to clear the xscom logic error. > + * > + * On failure, reset the XSCOM or we'll hang on the next access > + */ > + if (hmer & SPR_HMER_XSCOM_FAIL) > + xscom_reset(gcid); > + > + return 1; > +} > + > static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, > - bool is_write, int64_t retries) > + bool is_write, int64_t retries, > + int64_t *xscom_clear_retries) > { > unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); > int64_t rc = OPAL_HARDWARE; > @@ -191,6 +247,15 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add > break; > case 4: /* Invalid address / address error */ > rc = OPAL_XSCOM_ADDR_ERROR; > + if (xscom_clear_error(gcid, pcb_addr)) { > + /* return busy if retries still pending. */ > + if ((*xscom_clear_retries)--) > + return OPAL_XSCOM_BUSY; > + > + prlog(PR_DEBUG, "XSCOM: error recovery failed for " > + "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr); > + > + } > break; > case 5: /* Clock error */ > rc = OPAL_XSCOM_CLOCK_ERROR; > @@ -253,6 +318,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) > { > uint64_t hmer; > int64_t ret, retries; > + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; > > if (!xscom_gcid_ok(gcid)) { > prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); > @@ -276,7 +342,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) > return OPAL_SUCCESS; > > /* Handle error and possibly eventually retry */ > - ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries); > + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries, > + &xscom_clear_retries); > if (ret != OPAL_BUSY) > break; > } > @@ -303,6 +370,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) > { > uint64_t hmer; > int64_t ret, retries = 0; > + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; > > if (!xscom_gcid_ok(gcid)) { > prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); > @@ -326,7 +394,8 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) > return OPAL_SUCCESS; > > /* Handle error and possibly eventually retry */ > - ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries); > + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries, > + &xscom_clear_retries); > if (ret != OPAL_BUSY) > break; > } > diff --git a/include/xscom.h b/include/xscom.h > index 5a5d0b9..3a1374c 100644 > --- a/include/xscom.h > +++ b/include/xscom.h > @@ -206,6 +206,14 @@ > /* Max number of retries when XSCOM remains busy */ > #define XSCOM_BUSY_MAX_RETRIES 3000 > > +/* Max number of retries after xscom clearing is done */ > +#define XSCOM_CLEAR_MAX_RETRIES 3 > + > +/* xscom clear address range/mask */ > +#define XSCOM_CLEAR_RANGE_START 0x20010A00 > +#define XSCOM_CLEAR_RANGE_END 0x20010ABF > +#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF > + > /* Retry count after which to reset XSCOM, if still busy */ > #define XSCOM_BUSY_RESET_THRESHOLD 1000 > > > _______________________________________________ > Skiboot mailing list > Skiboot@lists.ozlabs.org > https://lists.ozlabs.org/listinfo/skiboot
diff --git a/hw/xscom.c b/hw/xscom.c index 2621465..2ad5114 100644 --- a/hw/xscom.c +++ b/hw/xscom.c @@ -151,8 +151,64 @@ static void xscom_reset(uint32_t gcid) */ } +static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr) +{ + u64 hmer; + uint32_t base_xscom_addr; + uint32_t xscom_clear_reg = 0x20010800; + + /* only in case of p9 */ + if (proc_gen != proc_gen_p9) + return 0; + + /* + * Due to a hardware issue where core responding to scom was delayed + * due to thread reconfiguration, leaves the scom logic in a state + * where the subsequent scom to that core can get errors. This is + * affected for Core PC scom registers in the range of + * 20010A80-20010ABF. + * + * The solution is if a xscom timeout occurs to one of Core PC scom + * registers in the range of 20010A80-20010ABF, a clearing scom + * write is done to 0x20010800 with data of '0x00000000' which will + * also get a timeout but clears the scom logic errors. After the + * clearing write is done the original scom operation can be retried. + * + * The scom timeout is reported as status 0x4 (Invalid address) + * in HMER[21-23]. + */ + + base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK; + if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) && + (base_xscom_addr <= XSCOM_CLEAR_RANGE_END))) + return 0; + + /* Reset the XSCOM or next scom operation will fail. */ + xscom_reset(gcid); + + /* Clear errors in HMER */ + mtspr(SPR_HMER, HMER_CLR_MASK); + + /* Write 0 to clear the xscom logic errors on target chip */ + out_be64(xscom_addr(gcid, xscom_clear_reg), 0); + hmer = xscom_wait_done(); + + /* + * Above clearing xscom write will timeout and error out with + * invalid access as there is no register at that address. This + * xscom operation just helps to clear the xscom logic error. + * + * On failure, reset the XSCOM or we'll hang on the next access + */ + if (hmer & SPR_HMER_XSCOM_FAIL) + xscom_reset(gcid); + + return 1; +} + static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, - bool is_write, int64_t retries) + bool is_write, int64_t retries, + int64_t *xscom_clear_retries) { unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); int64_t rc = OPAL_HARDWARE; @@ -191,6 +247,15 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add break; case 4: /* Invalid address / address error */ rc = OPAL_XSCOM_ADDR_ERROR; + if (xscom_clear_error(gcid, pcb_addr)) { + /* return busy if retries still pending. */ + if ((*xscom_clear_retries)--) + return OPAL_XSCOM_BUSY; + + prlog(PR_DEBUG, "XSCOM: error recovery failed for " + "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr); + + } break; case 5: /* Clock error */ rc = OPAL_XSCOM_CLOCK_ERROR; @@ -253,6 +318,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) { uint64_t hmer; int64_t ret, retries; + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; if (!xscom_gcid_ok(gcid)) { prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); @@ -276,7 +342,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) return OPAL_SUCCESS; /* Handle error and possibly eventually retry */ - ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries); + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries, + &xscom_clear_retries); if (ret != OPAL_BUSY) break; } @@ -303,6 +370,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) { uint64_t hmer; int64_t ret, retries = 0; + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; if (!xscom_gcid_ok(gcid)) { prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); @@ -326,7 +394,8 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) return OPAL_SUCCESS; /* Handle error and possibly eventually retry */ - ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries); + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries, + &xscom_clear_retries); if (ret != OPAL_BUSY) break; } diff --git a/include/xscom.h b/include/xscom.h index 5a5d0b9..3a1374c 100644 --- a/include/xscom.h +++ b/include/xscom.h @@ -206,6 +206,14 @@ /* Max number of retries when XSCOM remains busy */ #define XSCOM_BUSY_MAX_RETRIES 3000 +/* Max number of retries after xscom clearing is done */ +#define XSCOM_CLEAR_MAX_RETRIES 3 + +/* xscom clear address range/mask */ +#define XSCOM_CLEAR_RANGE_START 0x20010A00 +#define XSCOM_CLEAR_RANGE_END 0x20010ABF +#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF + /* Retry count after which to reset XSCOM, if still busy */ #define XSCOM_BUSY_RESET_THRESHOLD 1000