diff mbox series

[linux,dev-4.13,1/4] fsi/occ: Add retries on SBE errors

Message ID 20180518013500.18005-1-benh@kernel.crashing.org
State Superseded, archived
Headers show
Series [linux,dev-4.13,1/4] fsi/occ: Add retries on SBE errors | expand

Commit Message

Benjamin Herrenschmidt May 18, 2018, 1:34 a.m. UTC
This has proven useful in case of problems with the FSI
communication. We retry up to 3 times.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
 1 file changed, 104 insertions(+), 75 deletions(-)

Comments

Andrew Jeffery May 21, 2018, 5:14 a.m. UTC | #1
On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> This has proven useful in case of problems with the FSI
> communication. We retry up to 3 times.

As a note, there's a separate, OCC-specific error-handling sequence as well, but this is currently done in drivers/hwmon/occ/common.c.

From the OCC documentation[1]:

3.3.1 BMC-OCC Communication Failure Handling

On failures communicating with an OCC the BMC should first verify that the “OCC Active” sensor is TRUE.  If the OCCs are not active the error should be ignored and communication with the OCC should not be retired until the “OCC Active” sensor is TRUE.  If the “OCC Active” sensor is TRUE the command should be retried twice.  If the command still fails after two retries and the “OCC Active” sensor is still “TRUE” and there is no checkstop the error is valid and a request to reset the OCCs should be sent.

[1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf

> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
>  1 file changed, 104 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> index bdee26096688..f4b2df7a3084 100644
> --- a/drivers/fsi/fsi-occ.c
> +++ b/drivers/fsi/fsi-occ.c
> @@ -35,6 +35,7 @@
>  #define OCC_SRAM_BYTES		4096
>  #define OCC_CMD_DATA_BYTES	4090
>  #define OCC_RESP_DATA_BYTES	4089
> +#define OCC_COMMAND_RETRIES	3
>  
>  /*
>   * Assume we don't have FFDC, if we do we'll overflow and
> @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32 
> address, u8 *data,
>  		       ssize_t len)
>  {
>  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> -	size_t resp_len, resp_data_len;
> +	size_t saved_resp_len, resp_len, resp_data_len;
>  	__be32 *resp, cmd[5];
> -	int rc;
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  
>  	/*
>  	 * Magic sequence to do SBE getsram command. SBE will fetch data from
> @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32 
> address, u8 *data,
>  	cmd[3] = cpu_to_be32(address);
>  	cmd[4] = cpu_to_be32(data_len);
>  
> -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>  	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
>  	if (!resp)
> -		return -ENOMEM;
> -
> -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> -	if (rc)
> -		goto free;
> -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> -	if (rc)
> -		goto free;
> +		rc = -ENOMEM;
> +
> +	rc = -1;
> +	while (rc && retries--) {
> +		resp_len = saved_resp_len;
> +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
> +		}
> +	}
>  
> -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> -	if (resp_data_len != data_len) {
> -		pr_err("occ: SRAM read expected %d bytes got %d\n",
> -		       data_len, resp_data_len);
> -		rc = -EBADMSG;
> -	} else {
> -		memcpy(data, resp, len);
> +	/* Check response lenght and copy data */
> +	if (rc == 0) {
> +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> +		if (resp_data_len != data_len) {
> +			pr_err("occ: SRAM read expected %d bytes got %d\n",
> +			       data_len, resp_data_len);
> +			rc = -EBADMSG;
> +		} else {
> +			memcpy(data, resp, len);
> +		}
>  	}
>  
> -free:
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
> @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32 
> address, u8 *data,
>  {
>  	size_t cmd_len, buf_len, resp_len, resp_data_len;
>  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  	__be32 *buf;
> -	int rc;
>  
>  	/*
>  	 * We use the same buffer for command and response, make
> @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32 
> address, u8 *data,
>  	if (!buf)
>  		return -ENOMEM;
>  
> -	/*
> -	 * Magic sequence to do SBE putsram command. SBE will transfer
> -	 * data to specified SRAM address.
> -	 */
> -	buf[0] = cpu_to_be32(cmd_len);
> -	buf[1] = cpu_to_be32(0xa404);
> -	buf[2] = cpu_to_be32(1);
> -	buf[3] = cpu_to_be32(address);
> -	buf[4] = cpu_to_be32(data_len);
> -
> -	memcpy(&buf[5], data, len);
> -
> -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> -	if (rc)
> -		goto free;
> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> -	if (rc)
> -		goto free;
> +	rc = -1;
> +	while (rc && retries--) {
> +		/*
> +		 * Magic sequence to do SBE putsram command. SBE will transfer
> +		 * data to specified SRAM address.
> +		 */
> +		buf[0] = cpu_to_be32(cmd_len);
> +		buf[1] = cpu_to_be32(0xa404);
> +		buf[2] = cpu_to_be32(1);
> +		buf[3] = cpu_to_be32(address);
> +		buf[4] = cpu_to_be32(data_len);

Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.

> +
> +		memcpy(&buf[5], data, len);
> +
> +		resp_len = OCC_SBE_STATUS_WORDS;
> +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
> +		}
> +	}
>  
> -	if (resp_len != 1) {
> -		pr_err("occ: SRAM write response lenght invalid: %d\n",
> -		       resp_len);
> -		rc = -EBADMSG;
> -	} else {
> -		resp_data_len = be32_to_cpu(buf[0]);
> -		if (resp_data_len != data_len) {
> -			pr_err("occ: SRAM write expected %d bytes got %d\n",
> -			       data_len, resp_data_len);
> +	/* Check response lenght */
> +	if (rc == 0) {
> +		if (resp_len != 1) {
> +			pr_err("occ: SRAM write response lenght invalid: %d\n",

Hmm, not your fault but I just noticed 'length' is misspelled here.

> +			       resp_len);
>  			rc = -EBADMSG;
> +		} else {
> +			resp_data_len = be32_to_cpu(buf[0]);
> +			if (resp_data_len != data_len) {
> +				pr_err("occ: SRAM write expected %d bytes got %d\n",
> +				       data_len, resp_data_len);
> +				rc = -EBADMSG;
> +			}
>  		}
>  	}
> -free:
> +
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
> @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
>  {
>  	__be32 buf[OCC_SBE_STATUS_WORDS];
>  	size_t resp_len, resp_data_len;
> -	int rc;
> +	int rc, retries = OCC_COMMAND_RETRIES;
>  
>  	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
> -	resp_len = OCC_SBE_STATUS_WORDS;
>  
> -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> -	buf[6] = 0;
> -
> -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> -	if (rc)
> -		goto error;
> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> -	if (rc)
> -		goto error;
> +	rc = -1;
> +	while (rc && retries--) {
> +		resp_len = OCC_SBE_STATUS_WORDS;
> +
> +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> +		buf[6] = 0;
> +
> +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> +		if (rc == 0)
> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> +		if (rc) {
> +			if (rc < 0)
> +				pr_err("occ: FSI error %d, retrying attn\n", rc);
> +			else
> +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
> +		}
> +	}
>  
> -	if (resp_len != 1) {
> -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
> -		       resp_len);
> -		rc = -EBADMSG;
> -	} else {
> -		resp_data_len = be32_to_cpu(buf[0]);
> -		if (resp_data_len != 8) {
> -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> -			       resp_data_len);
> +	/* Check response lenght */

'length'

> +	if (rc == 0) {
> +		if (resp_len != 1) {
> +			pr_err("occ: SRAM attn response lenght invalid: %d\n",

'length'

> +			       resp_len);
>  			rc = -EBADMSG;
> +		} else {
> +			resp_data_len = be32_to_cpu(buf[0]);
> +			if (resp_data_len != 8) {
> +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> +				       resp_data_len);
> +				rc = -EBADMSG;
> +			}
>  		}
>  	}
> - error:
> +
>  	/* Convert positive SBEI status */
>  	if (rc > 0) {
>  		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
>  		rc = -EBADMSG;
>  	}
>  
> -
>  	return rc;
>  }
>  
> -- 
> 2.17.0
>
Benjamin Herrenschmidt May 21, 2018, 8:33 a.m. UTC | #2
On Mon, 2018-05-21 at 14:44 +0930, Andrew Jeffery wrote:
> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
> > This has proven useful in case of problems with the FSI
> > communication. We retry up to 3 times.
> 
> As a note, there's a separate, OCC-specific error-handling sequence
> as well, but this is currently done in drivers/hwmon/occ/common.c.

This is more for transport failures. It might be less useful now that
I've added CRC error recovery in the FSI layer and made it more
reliable overall. Experience showed that the upper OCC layer just
didn't deal with it well.

> > From the OCC documentation[1]:
> 
> 3.3.1 BMC-OCC Communication Failure Handling
> 
> On failures communicating with an OCC the BMC should first verify
> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
> error should be ignored and communication with the OCC should not be
> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
> sensor is TRUE the command should be retried twice. 

What is the "OCC Active sensor" ?

>  If the command still fails after two retries and the “OCC Active”
> sensor is still “TRUE” and there is no checkstop the error is valid
> and a request to reset the OCCs should be sent.
> 
> [1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
> 
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> >  drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
> >  1 file changed, 104 insertions(+), 75 deletions(-)
> > 
> > diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
> > index bdee26096688..f4b2df7a3084 100644
> > --- a/drivers/fsi/fsi-occ.c
> > +++ b/drivers/fsi/fsi-occ.c
> > @@ -35,6 +35,7 @@
> >  #define OCC_SRAM_BYTES		4096
> >  #define OCC_CMD_DATA_BYTES	4090
> >  #define OCC_RESP_DATA_BYTES	4089
> > +#define OCC_COMMAND_RETRIES	3
> >  
> >  /*
> >   * Assume we don't have FFDC, if we do we'll overflow and
> > @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  		       ssize_t len)
> >  {
> >  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> > -	size_t resp_len, resp_data_len;
> > +	size_t saved_resp_len, resp_len, resp_data_len;
> >  	__be32 *resp, cmd[5];
> > -	int rc;
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  
> >  	/*
> >  	 * Magic sequence to do SBE getsram command. SBE will fetch data from
> > @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  	cmd[3] = cpu_to_be32(address);
> >  	cmd[4] = cpu_to_be32(data_len);
> >  
> > -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> > +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
> >  	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
> >  	if (!resp)
> > -		return -ENOMEM;
> > -
> > -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> > -	if (rc)
> > -		goto free;
> > -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> > -	if (rc)
> > -		goto free;
> > +		rc = -ENOMEM;
> > +
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		resp_len = saved_resp_len;
> > +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
> > +		}
> > +	}
> >  
> > -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> > -	if (resp_data_len != data_len) {
> > -		pr_err("occ: SRAM read expected %d bytes got %d\n",
> > -		       data_len, resp_data_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		memcpy(data, resp, len);
> > +	/* Check response lenght and copy data */
> > +	if (rc == 0) {
> > +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
> > +		if (resp_data_len != data_len) {
> > +			pr_err("occ: SRAM read expected %d bytes got %d\n",
> > +			       data_len, resp_data_len);
> > +			rc = -EBADMSG;
> > +		} else {
> > +			memcpy(data, resp, len);
> > +		}
> >  	}
> >  
> > -free:
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
> > @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  {
> >  	size_t cmd_len, buf_len, resp_len, resp_data_len;
> >  	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  	__be32 *buf;
> > -	int rc;
> >  
> >  	/*
> >  	 * We use the same buffer for command and response, make
> > @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32 
> > address, u8 *data,
> >  	if (!buf)
> >  		return -ENOMEM;
> >  
> > -	/*
> > -	 * Magic sequence to do SBE putsram command. SBE will transfer
> > -	 * data to specified SRAM address.
> > -	 */
> > -	buf[0] = cpu_to_be32(cmd_len);
> > -	buf[1] = cpu_to_be32(0xa404);
> > -	buf[2] = cpu_to_be32(1);
> > -	buf[3] = cpu_to_be32(address);
> > -	buf[4] = cpu_to_be32(data_len);
> > -
> > -	memcpy(&buf[5], data, len);
> > -
> > -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> > -	if (rc)
> > -		goto free;
> > -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > -	if (rc)
> > -		goto free;
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		/*
> > +		 * Magic sequence to do SBE putsram command. SBE will transfer
> > +		 * data to specified SRAM address.
> > +		 */
> > +		buf[0] = cpu_to_be32(cmd_len);
> > +		buf[1] = cpu_to_be32(0xa404);
> > +		buf[2] = cpu_to_be32(1);
> > +		buf[3] = cpu_to_be32(address);
> > +		buf[4] = cpu_to_be32(data_len);
> 
> Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.
> 
> > +
> > +		memcpy(&buf[5], data, len);
> > +
> > +		resp_len = OCC_SBE_STATUS_WORDS;
> > +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
> > +		}
> > +	}
> >  
> > -	if (resp_len != 1) {
> > -		pr_err("occ: SRAM write response lenght invalid: %d\n",
> > -		       resp_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		resp_data_len = be32_to_cpu(buf[0]);
> > -		if (resp_data_len != data_len) {
> > -			pr_err("occ: SRAM write expected %d bytes got %d\n",
> > -			       data_len, resp_data_len);
> > +	/* Check response lenght */
> > +	if (rc == 0) {
> > +		if (resp_len != 1) {
> > +			pr_err("occ: SRAM write response lenght invalid: %d\n",
> 
> Hmm, not your fault but I just noticed 'length' is misspelled here.
> 
> > +			       resp_len);
> >  			rc = -EBADMSG;
> > +		} else {
> > +			resp_data_len = be32_to_cpu(buf[0]);
> > +			if (resp_data_len != data_len) {
> > +				pr_err("occ: SRAM write expected %d bytes got %d\n",
> > +				       data_len, resp_data_len);
> > +				rc = -EBADMSG;
> > +			}
> >  		}
> >  	}
> > -free:
> > +
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
> > @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
> >  {
> >  	__be32 buf[OCC_SBE_STATUS_WORDS];
> >  	size_t resp_len, resp_data_len;
> > -	int rc;
> > +	int rc, retries = OCC_COMMAND_RETRIES;
> >  
> >  	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
> > -	resp_len = OCC_SBE_STATUS_WORDS;
> >  
> > -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> > -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> > -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> > -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> > -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> > -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> > -	buf[6] = 0;
> > -
> > -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> > -	if (rc)
> > -		goto error;
> > -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > -	if (rc)
> > -		goto error;
> > +	rc = -1;
> > +	while (rc && retries--) {
> > +		resp_len = OCC_SBE_STATUS_WORDS;
> > +
> > +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
> > +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
> > +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
> > +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
> > +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
> > +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
> > +		buf[6] = 0;
> > +
> > +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
> > +		if (rc == 0)
> > +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
> > +		if (rc) {
> > +			if (rc < 0)
> > +				pr_err("occ: FSI error %d, retrying attn\n", rc);
> > +			else
> > +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
> > +		}
> > +	}
> >  
> > -	if (resp_len != 1) {
> > -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
> > -		       resp_len);
> > -		rc = -EBADMSG;
> > -	} else {
> > -		resp_data_len = be32_to_cpu(buf[0]);
> > -		if (resp_data_len != 8) {
> > -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> > -			       resp_data_len);
> > +	/* Check response lenght */
> 
> 'length'
> 
> > +	if (rc == 0) {
> > +		if (resp_len != 1) {
> > +			pr_err("occ: SRAM attn response lenght invalid: %d\n",
> 
> 'length'
> 
> > +			       resp_len);
> >  			rc = -EBADMSG;
> > +		} else {
> > +			resp_data_len = be32_to_cpu(buf[0]);
> > +			if (resp_data_len != 8) {
> > +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
> > +				       resp_data_len);
> > +				rc = -EBADMSG;
> > +			}
> >  		}
> >  	}
> > - error:
> > +
> >  	/* Convert positive SBEI status */
> >  	if (rc > 0) {
> >  		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
> >  		rc = -EBADMSG;
> >  	}
> >  
> > -
> >  	return rc;
> >  }
> >  
> > -- 
> > 2.17.0
> >
Eddie James May 21, 2018, 6:48 p.m. UTC | #3
On 05/21/2018 03:33 AM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 14:44 +0930, Andrew Jeffery wrote:
>> On Fri, 18 May 2018, at 11:04, Benjamin Herrenschmidt wrote:
>>> This has proven useful in case of problems with the FSI
>>> communication. We retry up to 3 times.
>> As a note, there's a separate, OCC-specific error-handling sequence
>> as well, but this is currently done in drivers/hwmon/occ/common.c.
> This is more for transport failures. It might be less useful now that
> I've added CRC error recovery in the FSI layer and made it more
> reliable overall. Experience showed that the upper OCC layer just
> didn't deal with it well.
>
>>>  From the OCC documentation[1]:
>> 3.3.1 BMC-OCC Communication Failure Handling
>>
>> On failures communicating with an OCC the BMC should first verify
>> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
>> error should be ignored and communication with the OCC should not be
>> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
>> sensor is TRUE the command should be retried twice.
> What is the "OCC Active sensor" ?

It's a value in the OCC poll response.

>
>>   If the command still fails after two retries and the “OCC Active”
>> sensor is still “TRUE” and there is no checkstop the error is valid
>> and a request to reset the OCCs should be sent.
>>
>> [1] https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf
>>
>>> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> ---
>>>   drivers/fsi/fsi-occ.c | 179 ++++++++++++++++++++++++------------------
>>>   1 file changed, 104 insertions(+), 75 deletions(-)
>>>
>>> diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
>>> index bdee26096688..f4b2df7a3084 100644
>>> --- a/drivers/fsi/fsi-occ.c
>>> +++ b/drivers/fsi/fsi-occ.c
>>> @@ -35,6 +35,7 @@
>>>   #define OCC_SRAM_BYTES		4096
>>>   #define OCC_CMD_DATA_BYTES	4090
>>>   #define OCC_RESP_DATA_BYTES	4089
>>> +#define OCC_COMMAND_RETRIES	3
>>>   
>>>   /*
>>>    * Assume we don't have FFDC, if we do we'll overflow and
>>> @@ -458,9 +459,9 @@ static int occ_getsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   		       ssize_t len)
>>>   {
>>>   	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
>>> -	size_t resp_len, resp_data_len;
>>> +	size_t saved_resp_len, resp_len, resp_data_len;
>>>   	__be32 *resp, cmd[5];
>>> -	int rc;
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   
>>>   	/*
>>>   	 * Magic sequence to do SBE getsram command. SBE will fetch data from
>>> @@ -472,28 +473,37 @@ static int occ_getsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   	cmd[3] = cpu_to_be32(address);
>>>   	cmd[4] = cpu_to_be32(data_len);
>>>   
>>> -	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>>> +	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
>>>   	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
>>>   	if (!resp)
>>> -		return -ENOMEM;
>>> -
>>> -	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> -	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> +		rc = -ENOMEM;
>>> +
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		resp_len = saved_resp_len;
>>> +		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying sram read\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
>>> -	if (resp_data_len != data_len) {
>>> -		pr_err("occ: SRAM read expected %d bytes got %d\n",
>>> -		       data_len, resp_data_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		memcpy(data, resp, len);
>>> +	/* Check response lenght and copy data */
>>> +	if (rc == 0) {
>>> +		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
>>> +		if (resp_data_len != data_len) {
>>> +			pr_err("occ: SRAM read expected %d bytes got %d\n",
>>> +			       data_len, resp_data_len);
>>> +			rc = -EBADMSG;
>>> +		} else {
>>> +			memcpy(data, resp, len);
>>> +		}
>>>   	}
>>>   
>>> -free:
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
>>> @@ -508,8 +518,8 @@ static int occ_putsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   {
>>>   	size_t cmd_len, buf_len, resp_len, resp_data_len;
>>>   	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   	__be32 *buf;
>>> -	int rc;
>>>   
>>>   	/*
>>>   	 * We use the same buffer for command and response, make
>>> @@ -522,38 +532,48 @@ static int occ_putsram(struct device *sbefifo, u32
>>> address, u8 *data,
>>>   	if (!buf)
>>>   		return -ENOMEM;
>>>   
>>> -	/*
>>> -	 * Magic sequence to do SBE putsram command. SBE will transfer
>>> -	 * data to specified SRAM address.
>>> -	 */
>>> -	buf[0] = cpu_to_be32(cmd_len);
>>> -	buf[1] = cpu_to_be32(0xa404);
>>> -	buf[2] = cpu_to_be32(1);
>>> -	buf[3] = cpu_to_be32(address);
>>> -	buf[4] = cpu_to_be32(data_len);
>>> -
>>> -	memcpy(&buf[5], data, len);
>>> -
>>> -	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto free;
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		/*
>>> +		 * Magic sequence to do SBE putsram command. SBE will transfer
>>> +		 * data to specified SRAM address.
>>> +		 */
>>> +		buf[0] = cpu_to_be32(cmd_len);
>>> +		buf[1] = cpu_to_be32(0xa404);
>>> +		buf[2] = cpu_to_be32(1);
>>> +		buf[3] = cpu_to_be32(address);
>>> +		buf[4] = cpu_to_be32(data_len);
>> Maybe it's worth a comment that we're doing this work inside the loop because we're reusing the buffer for the response, and we're doing that to minimise the memory consumption, and it motivated by the SBEFIFO protocol crazy.
>>
>>> +
>>> +		memcpy(&buf[5], data, len);
>>> +
>>> +		resp_len = OCC_SBE_STATUS_WORDS;
>>> +		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying sram write\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	if (resp_len != 1) {
>>> -		pr_err("occ: SRAM write response lenght invalid: %d\n",
>>> -		       resp_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		resp_data_len = be32_to_cpu(buf[0]);
>>> -		if (resp_data_len != data_len) {
>>> -			pr_err("occ: SRAM write expected %d bytes got %d\n",
>>> -			       data_len, resp_data_len);
>>> +	/* Check response lenght */
>>> +	if (rc == 0) {
>>> +		if (resp_len != 1) {
>>> +			pr_err("occ: SRAM write response lenght invalid: %d\n",
>> Hmm, not your fault but I just noticed 'length' is misspelled here.
>>
>>> +			       resp_len);
>>>   			rc = -EBADMSG;
>>> +		} else {
>>> +			resp_data_len = be32_to_cpu(buf[0]);
>>> +			if (resp_data_len != data_len) {
>>> +				pr_err("occ: SRAM write expected %d bytes got %d\n",
>>> +				       data_len, resp_data_len);
>>> +				rc = -EBADMSG;
>>> +			}
>>>   		}
>>>   	}
>>> -free:
>>> +
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
>>> @@ -567,46 +587,55 @@ static int occ_trigger_attn(struct device *sbefifo)
>>>   {
>>>   	__be32 buf[OCC_SBE_STATUS_WORDS];
>>>   	size_t resp_len, resp_data_len;
>>> -	int rc;
>>> +	int rc, retries = OCC_COMMAND_RETRIES;
>>>   
>>>   	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
>>> -	resp_len = OCC_SBE_STATUS_WORDS;
>>>   
>>> -	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
>>> -	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
>>> -	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
>>> -	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
>>> -	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
>>> -	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
>>> -	buf[6] = 0;
>>> -
>>> -	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
>>> -	if (rc)
>>> -		goto error;
>>> -	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> -	if (rc)
>>> -		goto error;
>>> +	rc = -1;
>>> +	while (rc && retries--) {
>>> +		resp_len = OCC_SBE_STATUS_WORDS;
>>> +
>>> +		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
>>> +		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
>>> +		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
>>> +		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
>>> +		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
>>> +		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
>>> +		buf[6] = 0;
>>> +
>>> +		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
>>> +		if (rc == 0)
>>> +			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
>>> +		if (rc) {
>>> +			if (rc < 0)
>>> +				pr_err("occ: FSI error %d, retrying attn\n", rc);
>>> +			else
>>> +				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
>>> +		}
>>> +	}
>>>   
>>> -	if (resp_len != 1) {
>>> -		pr_err("occ: SRAM attn response lenght invalid: %d\n",
>>> -		       resp_len);
>>> -		rc = -EBADMSG;
>>> -	} else {
>>> -		resp_data_len = be32_to_cpu(buf[0]);
>>> -		if (resp_data_len != 8) {
>>> -			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
>>> -			       resp_data_len);
>>> +	/* Check response lenght */
>> 'length'
>>
>>> +	if (rc == 0) {
>>> +		if (resp_len != 1) {
>>> +			pr_err("occ: SRAM attn response lenght invalid: %d\n",
>> 'length'
>>
>>> +			       resp_len);
>>>   			rc = -EBADMSG;
>>> +		} else {
>>> +			resp_data_len = be32_to_cpu(buf[0]);
>>> +			if (resp_data_len != 8) {
>>> +				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
>>> +				       resp_data_len);
>>> +				rc = -EBADMSG;
>>> +			}
>>>   		}
>>>   	}
>>> - error:
>>> +
>>>   	/* Convert positive SBEI status */
>>>   	if (rc > 0) {
>>>   		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
>>>   		rc = -EBADMSG;
>>>   	}
>>>   
>>> -
>>>   	return rc;
>>>   }
>>>   
>>> -- 
>>> 2.17.0
>>>
Benjamin Herrenschmidt May 21, 2018, 10:53 p.m. UTC | #4
On Mon, 2018-05-21 at 13:48 -0500, Eddie James wrote:
> 
> > > 3.3.1 BMC-OCC Communication Failure Handling
> > > 
> > > On failures communicating with an OCC the BMC should first verify
> > > that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
> > > error should be ignored and communication with the OCC should not be
> > > retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
> > > sensor is TRUE the command should be retried twice.
> > 
> > What is the "OCC Active sensor" ?
> 
> It's a value in the OCC poll response.

That's only useful if you can get that response then... which you can't
if the communication fails. I'm missing something here.

Ben.
Eddie James May 22, 2018, 2:09 p.m. UTC | #5
On 05/21/2018 05:53 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-05-21 at 13:48 -0500, Eddie James wrote:
>>>> 3.3.1 BMC-OCC Communication Failure Handling
>>>>
>>>> On failures communicating with an OCC the BMC should first verify
>>>> that the “OCC Active” sensor is TRUE.  If the OCCs are not active the
>>>> error should be ignored and communication with the OCC should not be
>>>> retired until the “OCC Active” sensor is TRUE.  If the “OCC Active”
>>>> sensor is TRUE the command should be retried twice.
>>> What is the "OCC Active sensor" ?
>> It's a value in the OCC poll response.
> That's only useful if you can get that response then... which you can't
> if the communication fails. I'm missing something here.

Ah. There is also the IPMI OCC active sensor, which is what this must 
mean. We're doing this correctly by unbinding the occ-hwmon driver when 
the OCC active sensor comes in false. So, if driver is bound, OCC active 
must be true, so we "retry" twice by only setting the error attribute 
after two failed poll responses.

Thanks... sorry for the mixup.
Eddie

>
> Ben.
>
diff mbox series

Patch

diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c
index bdee26096688..f4b2df7a3084 100644
--- a/drivers/fsi/fsi-occ.c
+++ b/drivers/fsi/fsi-occ.c
@@ -35,6 +35,7 @@ 
 #define OCC_SRAM_BYTES		4096
 #define OCC_CMD_DATA_BYTES	4090
 #define OCC_RESP_DATA_BYTES	4089
+#define OCC_COMMAND_RETRIES	3
 
 /*
  * Assume we don't have FFDC, if we do we'll overflow and
@@ -458,9 +459,9 @@  static int occ_getsram(struct device *sbefifo, u32 address, u8 *data,
 		       ssize_t len)
 {
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
-	size_t resp_len, resp_data_len;
+	size_t saved_resp_len, resp_len, resp_data_len;
 	__be32 *resp, cmd[5];
-	int rc;
+	int rc, retries = OCC_COMMAND_RETRIES;
 
 	/*
 	 * Magic sequence to do SBE getsram command. SBE will fetch data from
@@ -472,28 +473,37 @@  static int occ_getsram(struct device *sbefifo, u32 address, u8 *data,
 	cmd[3] = cpu_to_be32(address);
 	cmd[4] = cpu_to_be32(data_len);
 
-	resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
+	resp_len = saved_resp_len = (data_len >> 2) + OCC_SBE_STATUS_WORDS;
 	resp = kzalloc(resp_len << 2 , GFP_KERNEL);
 	if (!resp)
-		return -ENOMEM;
-
-	rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
-	if (rc)
-		goto free;
-	rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
-	if (rc)
-		goto free;
+		rc = -ENOMEM;
+
+	rc = -1;
+	while (rc && retries--) {
+		resp_len = saved_resp_len;
+		rc = sbefifo_submit(sbefifo, cmd, 5, resp, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa403, resp, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying sram read\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying sram read\n", rc);
+		}
+	}
 
-	resp_data_len = be32_to_cpu(resp[resp_len - 1]);
-	if (resp_data_len != data_len) {
-		pr_err("occ: SRAM read expected %d bytes got %d\n",
-		       data_len, resp_data_len);
-		rc = -EBADMSG;
-	} else {
-		memcpy(data, resp, len);
+	/* Check response lenght and copy data */
+	if (rc == 0) {
+		resp_data_len = be32_to_cpu(resp[resp_len - 1]);
+		if (resp_data_len != data_len) {
+			pr_err("occ: SRAM read expected %d bytes got %d\n",
+			       data_len, resp_data_len);
+			rc = -EBADMSG;
+		} else {
+			memcpy(data, resp, len);
+		}
 	}
 
-free:
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM read returned failure status: %08x\n", rc);
@@ -508,8 +518,8 @@  static int occ_putsram(struct device *sbefifo, u32 address, u8 *data,
 {
 	size_t cmd_len, buf_len, resp_len, resp_data_len;
 	u32 data_len = ((len + 7) / 8) * 8;	/* must be multiples of 8 B */
+	int rc, retries = OCC_COMMAND_RETRIES;
 	__be32 *buf;
-	int rc;
 
 	/*
 	 * We use the same buffer for command and response, make
@@ -522,38 +532,48 @@  static int occ_putsram(struct device *sbefifo, u32 address, u8 *data,
 	if (!buf)
 		return -ENOMEM;
 
-	/*
-	 * Magic sequence to do SBE putsram command. SBE will transfer
-	 * data to specified SRAM address.
-	 */
-	buf[0] = cpu_to_be32(cmd_len);
-	buf[1] = cpu_to_be32(0xa404);
-	buf[2] = cpu_to_be32(1);
-	buf[3] = cpu_to_be32(address);
-	buf[4] = cpu_to_be32(data_len);
-
-	memcpy(&buf[5], data, len);
-
-	rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
-	if (rc)
-		goto free;
-	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
-	if (rc)
-		goto free;
+	rc = -1;
+	while (rc && retries--) {
+		/*
+		 * Magic sequence to do SBE putsram command. SBE will transfer
+		 * data to specified SRAM address.
+		 */
+		buf[0] = cpu_to_be32(cmd_len);
+		buf[1] = cpu_to_be32(0xa404);
+		buf[2] = cpu_to_be32(1);
+		buf[3] = cpu_to_be32(address);
+		buf[4] = cpu_to_be32(data_len);
+
+		memcpy(&buf[5], data, len);
+
+		resp_len = OCC_SBE_STATUS_WORDS;
+		rc = sbefifo_submit(sbefifo, buf, cmd_len, buf, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying sram write\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying sram write\n", rc);
+		}
+	}
 
-	if (resp_len != 1) {
-		pr_err("occ: SRAM write response lenght invalid: %d\n",
-		       resp_len);
-		rc = -EBADMSG;
-	} else {
-		resp_data_len = be32_to_cpu(buf[0]);
-		if (resp_data_len != data_len) {
-			pr_err("occ: SRAM write expected %d bytes got %d\n",
-			       data_len, resp_data_len);
+	/* Check response lenght */
+	if (rc == 0) {
+		if (resp_len != 1) {
+			pr_err("occ: SRAM write response lenght invalid: %d\n",
+			       resp_len);
 			rc = -EBADMSG;
+		} else {
+			resp_data_len = be32_to_cpu(buf[0]);
+			if (resp_data_len != data_len) {
+				pr_err("occ: SRAM write expected %d bytes got %d\n",
+				       data_len, resp_data_len);
+				rc = -EBADMSG;
+			}
 		}
 	}
-free:
+
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM write returned failure status: %08x\n", rc);
@@ -567,46 +587,55 @@  static int occ_trigger_attn(struct device *sbefifo)
 {
 	__be32 buf[OCC_SBE_STATUS_WORDS];
 	size_t resp_len, resp_data_len;
-	int rc;
+	int rc, retries = OCC_COMMAND_RETRIES;
 
 	BUILD_BUG_ON(OCC_SBE_STATUS_WORDS < 7);
-	resp_len = OCC_SBE_STATUS_WORDS;
 
-	buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
-	buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
-	buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
-	buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
-	buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
-	buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
-	buf[6] = 0;
-
-	rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
-	if (rc)
-		goto error;
-	rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
-	if (rc)
-		goto error;
+	rc = -1;
+	while (rc && retries--) {
+		resp_len = OCC_SBE_STATUS_WORDS;
+
+		buf[0] = cpu_to_be32(0x5 + 0x2);        /* Chip-op length in words */
+		buf[1] = cpu_to_be32(0xa404);           /* PutOCCSRAM */
+		buf[2] = cpu_to_be32(0x3);              /* Mode: Circular */
+		buf[3] = cpu_to_be32(0x0);              /* Address: ignored in mode 3 */
+		buf[4] = cpu_to_be32(0x8);              /* Data length in bytes */
+		buf[5] = cpu_to_be32(0x20010000);       /* Trigger OCC attention */
+		buf[6] = 0;
+
+		rc = sbefifo_submit(sbefifo, buf, 7, buf, &resp_len);
+		if (rc == 0)
+			rc = sbefifo_parse_status(0xa404, buf, resp_len, &resp_len);
+		if (rc) {
+			if (rc < 0)
+				pr_err("occ: FSI error %d, retrying attn\n", rc);
+			else
+				pr_err("occ: SBE error 0x%08x, retrying attn\n", rc);
+		}
+	}
 
-	if (resp_len != 1) {
-		pr_err("occ: SRAM attn response lenght invalid: %d\n",
-		       resp_len);
-		rc = -EBADMSG;
-	} else {
-		resp_data_len = be32_to_cpu(buf[0]);
-		if (resp_data_len != 8) {
-			pr_err("occ: SRAM attn expected 8 bytes got %d\n",
-			       resp_data_len);
+	/* Check response lenght */
+	if (rc == 0) {
+		if (resp_len != 1) {
+			pr_err("occ: SRAM attn response lenght invalid: %d\n",
+			       resp_len);
 			rc = -EBADMSG;
+		} else {
+			resp_data_len = be32_to_cpu(buf[0]);
+			if (resp_data_len != 8) {
+				pr_err("occ: SRAM attn expected 8 bytes got %d\n",
+				       resp_data_len);
+				rc = -EBADMSG;
+			}
 		}
 	}
- error:
+
 	/* Convert positive SBEI status */
 	if (rc > 0) {
 		pr_err("occ: SRAM attn returned failure status: %08x\n", rc);
 		rc = -EBADMSG;
 	}
 
-
 	return rc;
 }