diff mbox series

npu2: hw-procedures: Add check_credits procedure

Message ID 1511299309-26353-1-git-send-email-arbab@linux.vnet.ibm.com
State Superseded
Headers show
Series npu2: hw-procedures: Add check_credits procedure | expand

Commit Message

Reza Arbab Nov. 21, 2017, 9:21 p.m. UTC
As an immediate mitigation for a current hardware glitch, add a procedure
that can be used to validate NTL credit values. This will be called as a
safeguard to check that link training succeeded.

Assert that things are exactly as we expect, because if they aren't, the
system will experience a catastrophic failure shortly after the start of
link traffic.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
---
 hw/npu2-hw-procedures.c | 39 ++++++++++++++++++++++++++++++++++++++-
 include/npu2.h          |  6 +++++-
 2 files changed, 43 insertions(+), 2 deletions(-)

Comments

Alistair Popple Nov. 21, 2017, 10:29 p.m. UTC | #1
On Tue, 21 Nov 2017 03:21:49 PM Reza Arbab wrote:
> As an immediate mitigation for a current hardware glitch, add a procedure
> that can be used to validate NTL credit values. This will be called as a
> safeguard to check that link training succeeded.
> 
> Assert that things are exactly as we expect, because if they aren't, the
> system will experience a catastrophic failure shortly after the start of
> link traffic.

I guess we could return a procedure failure which would result in the driver
load failing but not crash the whole system. However I suppose this failure mode
is much more subtle so I agree it's probably best to just fail loud and early
given this HW state clearly indicates a bug.

One comment though - can you please add this procedure to doc/nvlink.rst in this
patch? Thanks.

>
>  #define NPU2DEVDBG(p, fmt, a...)	NPU2DBG((p)->npu, fmt, ##a)
>  #define NPU2DEVINF(p, fmt, a...)	NPU2INF((p)->npu, fmt, ##a)
> -#define NPU2DEVERR(p, fmt, a...)	NPU2ERR((p)->npu, fmt, ##a)
> +#define NPU2DEVERR(p, fmt, a...)        prlog(PR_ERR, "NPU%d:%d:%d.%d " fmt, \
> +					      (p)->npu->phb.opal_id, \
> +					      ((p)->bdfn >> 8) & 0xff, \
> +					      ((p)->bdfn >> 3) & 0x1f, \
> +					      (p)->bdfn & 0x7, ##a)

Would also be nice to add this info for NPU2DEVDBG/INF.

Acked-by: Alistair Popple <alistair@popple.id.au>

>
>  /* Number of PEs supported */
>  #define NPU2_MAX_PE_NUM		16
>
diff mbox series

Patch

diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
index 1db171a..bb8534b 100644
--- a/hw/npu2-hw-procedures.c
+++ b/hw/npu2-hw-procedures.c
@@ -663,6 +663,41 @@  static uint32_t phy_rx_training_wait(struct npu2_dev *ndev)
 }
 DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
 
+/*
+ * Compare one NPU register against the value it is expected to hold.
+ *
+ * Reads `reg` through npu2_read() on ndev->npu. Returns 0 when the value
+ * read equals `expected`; otherwise logs the register name together with
+ * the expected and actual values via NPU2DEVERR() and returns 1.
+ */
+static uint32_t check_credit(struct npu2_dev *ndev, uint64_t reg,
+			     const char *reg_name, uint64_t expected)
+{
+	uint64_t actual = npu2_read(ndev->npu, reg);
+
+	if (actual != expected) {
+		NPU2DEVERR(ndev, "%s: expected 0x%llx, read 0x%llx\n",
+			   reg_name, expected, actual);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check the register selected by `reg` for `ndev`.
+ *
+ * `reg` is invoked as reg(ndev) to obtain the register to read, and its
+ * spelling is stringified so the error message names the register.
+ * Evaluates to whatever check_credit() returns.
+ *
+ * There is deliberately no trailing semicolon: the macro expands to a
+ * plain expression and the caller supplies the statement terminator, so
+ * it can be used safely in larger expressions and unbraced if/else arms.
+ */
+#define CHECK_CREDIT(ndev, reg, expected) \
+	check_credit((ndev), reg(ndev), #reg, (expected))
+
+/*
+ * Procedure step: validate the NTL RX credit registers for this link.
+ *
+ * All six registers are checked, and each mismatch is logged, before the
+ * assert is evaluated, so a single failure report lists every bad
+ * register. Any mismatch is treated as fatal rather than returned as a
+ * procedure failure: per the commit rationale, wrong credit values lead
+ * to a catastrophic failure shortly after link traffic starts.
+ *
+ * Returns PROCEDURE_COMPLETE when every register holds its expected value.
+ */
+static uint32_t check_credits(struct npu2_dev *ndev)
+{
+	/* Expected register contents, named after the registers they apply to */
+	const uint64_t hdr_expected = 0x0BE0BE0000000000ULL;
+	const uint64_t data_expected = 0x1001000000000000ULL;
+	const uint64_t dbd_hdr_expected = 0x0640640000000000ULL;
+	const uint64_t atsd_hdr_expected = 0x0200200000000000ULL;
+	int fail = 0;
+
+	/* Header credits */
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, hdr_expected);
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, hdr_expected);
+
+	/* Data credits */
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, data_expected);
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, data_expected);
+
+	/* DBD and ATSD header credits */
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, dbd_hdr_expected);
+	fail += CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, atsd_hdr_expected);
+
+	/* Fail loudly and early if any register was wrong */
+	assert(!fail);
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(check_credits);
+
 static struct procedure *npu_procedures[] = {
 	&procedure_stop,
 	&procedure_nop,
@@ -678,7 +713,9 @@  static struct procedure *npu_procedures[] = {
 
 	/* Place holders for pre-terminate and terminate procedures */
 	&procedure_nop,
-	&procedure_nop};
+	&procedure_nop,
+	&procedure_check_credits
+};
 
 /* Run a procedure step(s) and return status */
 static uint32_t get_procedure_status(struct npu2_dev *dev)
diff --git a/include/npu2.h b/include/npu2.h
index 3b56f48..ce68113 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -29,7 +29,11 @@ 
 
 #define NPU2DEVDBG(p, fmt, a...)	NPU2DBG((p)->npu, fmt, ##a)
 #define NPU2DEVINF(p, fmt, a...)	NPU2INF((p)->npu, fmt, ##a)
-#define NPU2DEVERR(p, fmt, a...)	NPU2ERR((p)->npu, fmt, ##a)
+/*
+ * Error-level log for a single NPU2 device. The message is prefixed with
+ * the owning PHB's opal_id followed by three fields taken from the
+ * device's bdfn: bits 15:8, bits 7:3 and bits 2:0 (presumably the PCI
+ * bus, device and function numbers -- confirm against bdfn's definition).
+ */
+#define NPU2DEVERR(p, fmt, a...) \
+	prlog(PR_ERR, "NPU%d:%d:%d.%d " fmt, \
+	      (p)->npu->phb.opal_id, \
+	      ((p)->bdfn >> 8) & 0xff, \
+	      ((p)->bdfn >> 3) & 0x1f, \
+	      (p)->bdfn & 0x7, ##a)
 
 /* Number of PEs supported */
 #define NPU2_MAX_PE_NUM		16