diff mbox series

[v2,2/2] hwmon (occ): Retry for checksum failure

Message ID 20220426154956.27205-3-eajames@linux.ibm.com
State Accepted, archived
Headers show
Series fsi and hwmon (occ): Prevent occasional checksum failures | expand

Commit Message

Eddie James April 26, 2022, 3:49 p.m. UTC
Due to the OCC communication design with a shared SRAM area,
checksum errors are expected due to corrupted buffer from OCC
communications with other system components. Therefore, retry
the command twice in the event of a checksum failure.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/occ/p9_sbe.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

Comments

Joel Stanley April 27, 2022, 8:34 a.m. UTC | #1
On Tue, 26 Apr 2022 at 15:50, Eddie James <eajames@linux.ibm.com> wrote:
>
> Due to the OCC communication design with a shared SRAM area,
> checksum errors are expected due to corrupted buffer from OCC
> communications with other system components. Therefore, retry
> the command twice in the event of a checksum failure.
>
> Signed-off-by: Eddie James <eajames@linux.ibm.com>
> Acked-by: Guenter Roeck <linux@roeck-us.net>
> ---
>  drivers/hwmon/occ/p9_sbe.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c
> index 49b13cc01073..e6ccef2af659 100644
> --- a/drivers/hwmon/occ/p9_sbe.c
> +++ b/drivers/hwmon/occ/p9_sbe.c
> @@ -14,6 +14,8 @@
>
>  #include "common.h"
>
> +#define OCC_CHECKSUM_RETRIES   3
> +
>  struct p9_sbe_occ {
>         struct occ occ;
>         bool sbe_error;
> @@ -83,17 +85,22 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
>         struct occ_response *resp = &occ->resp;
>         struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
>         size_t resp_len = sizeof(*resp);
> +       int i;
>         int rc;
>
> -       rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> -       if (rc < 0) {
> +       for (i = 0; i < OCC_CHECKSUM_RETRIES; ++i) {
> +               rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
> +               if (rc >= 0)
> +                       break;
>                 if (resp_len) {
>                         if (p9_sbe_occ_save_ffdc(ctx, resp, resp_len))
>                                 sysfs_notify(&occ->bus_dev->kobj, NULL,
>                                              bin_attr_ffdc.attr.name);
> -               }
>
> -               return rc;
> +                       return rc;
> +               }
> +               if (rc != -EBADE)
> +                       return rc;

Future you might appreciate a comment above the EBADE check clarifying
why that error is being special-cased.

>         }
>
>         switch (resp->return_status) {
> --
> 2.27.0
>
diff mbox series

Patch

diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c
index 49b13cc01073..e6ccef2af659 100644
--- a/drivers/hwmon/occ/p9_sbe.c
+++ b/drivers/hwmon/occ/p9_sbe.c
@@ -14,6 +14,8 @@ 
 
 #include "common.h"
 
+#define OCC_CHECKSUM_RETRIES	3
+
 struct p9_sbe_occ {
 	struct occ occ;
 	bool sbe_error;
@@ -83,17 +85,22 @@  static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len)
 	struct occ_response *resp = &occ->resp;
 	struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ);
 	size_t resp_len = sizeof(*resp);
+	int i;
 	int rc;
 
-	rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
-	if (rc < 0) {
+	for (i = 0; i < OCC_CHECKSUM_RETRIES; ++i) {
+		rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len);
+		if (rc >= 0)
+			break;
 		if (resp_len) {
 			if (p9_sbe_occ_save_ffdc(ctx, resp, resp_len))
 				sysfs_notify(&occ->bus_dev->kobj, NULL,
 					     bin_attr_ffdc.attr.name);
-		}
 
-		return rc;
+			return rc;
+		}
+		if (rc != -EBADE)
+			return rc;
 	}
 
 	switch (resp->return_status) {