diff mbox series

[RFC] hw/npu2: Implement logging HMI actions

Message ID 20171113101334.31026-1-bsingharora@gmail.com
State Superseded
Headers show
Series [RFC] hw/npu2: Implement logging HMI actions | expand

Commit Message

Balbir Singh Nov. 13, 2017, 10:13 a.m. UTC
Log HMI errors as a starting step. We'll need to get Linux to deduce
and interpret the HMI event. The expectation is that we'll add a
corresponding handler in HMI handling parts to deal with the various
errors that have not been unhandled.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 core/hmi.c          | 84 ++++++++++++++++++++++++++++++++++++++++++++-
 include/npu2-regs.h | 18 ++++++++++
 include/opal-api.h  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+), 1 deletion(-)

Comments

Balbir Singh Nov. 21, 2017, 2:51 a.m. UTC | #1
On Mon, Nov 13, 2017 at 9:13 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> Log HMI errors as a starting step. We'll need to get Linux to deduce
> and interpret the HMI event. The expectation is that we'll add a
> corresponding handler in HMI handling parts to deal with the various
> errors that have not been unhandled.
>
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>

+Mahesh. I've found the patch extremely useful during debug, Reza/Alistair?

Balbir Singh.
Alistair Popple Nov. 21, 2017, 10:52 p.m. UTC | #2
On Tue, 21 Nov 2017 01:51:39 PM Balbir Singh wrote:
> On Mon, Nov 13, 2017 at 9:13 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> > Log HMI errors as a starting step. We'll need to get Linux to deduce
> > and interpret the HMI event. The expectation is that we'll add a
> > corresponding handler in HMI handling parts to deal with the various
> > errors that have not been unhandled.
> >
> > Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> 
> +Mahesh. I've found the patch extremely useful during debug, Reza/Alistair?

Seems like a good idea to me and it's basically equvalent to what we do for
NVLink1. Mind sending a non-RFC patch?

- Alistair

> Balbir Singh.
diff mbox series

Patch

diff --git a/core/hmi.c b/core/hmi.c
index 07c08462..b450032c 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -23,6 +23,7 @@ 
 #include <cpu.h>
 #include <chip.h>
 #include <npu-regs.h>
+#include <npu2-regs.h>
 #include <npu.h>
 #include <capp.h>
 
@@ -517,6 +518,87 @@  static void find_nx_checkstop_reason(int flat_chip_id,
 	*event_generated = true;
 }
 
+static void find_npu2_checkstop_reason(int flat_chip_id,
+				      struct OpalHMIEvent *hmi_evt,
+				      bool *event_generated)
+{
+	struct phb *phb;
+	struct npu *p = NULL;
+	int i;
+
+	uint64_t npu2_fir;
+	uint64_t npu2_fir_mask;
+	uint64_t npu2_fir_action0;
+	uint64_t npu2_fir_action1;
+	uint64_t npu2_fir_addr;
+	uint64_t npu2_fir_mask_addr;
+	uint64_t npu2_fir_action0_addr;
+	uint64_t npu2_fir_action1_addr;
+	uint64_t fatal_errors;
+
+	/* Only check for NPU errors if the chip has a NPU */
+	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9)
+		return;
+
+	/* Find the NPU on the chip associated with the HMI. */
+	for_each_phb(phb) {
+		/* NOTE: if a chip ever has >1 NPU this will need adjusting */
+		if (dt_node_is_compatible(phb->dt_node, "ibm,power9-npu-pciex") &&
+		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+			p = phb_to_npu(phb);
+			break;
+		}
+	}
+
+	/* If we didn't find a NPU on the chip, it's not our checkstop. */
+	if (p == NULL)
+		return;
+
+	npu2_fir_addr = NPU2_FIR_REGISTER_0;
+	npu2_fir_mask_addr = NPU2_FIR_REGISTER_0_MASK;
+	npu2_fir_action0_addr = NPU2_FIR_REGISTER_0_ACTION0;
+	npu2_fir_action1_addr = NPU2_FIR_REGISTER_0_ACTION1;
+
+	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+		/* Read all the registers necessary to find a checkstop condition. */
+		if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) ||
+			xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) ||
+			xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) ||
+			xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) {
+			prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i);
+			continue;
+		}
+
+		fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1;
+
+		/* If there's no errors, we don't need to do anything. */
+		if (!fatal_errors)
+			continue;
+
+		prlog(PR_ERR, "NPU: FIR#%d FIR 0x%016llx mask 0x%016llx\n",
+			i, npu2_fir, npu2_fir_mask);
+		prlog(PR_ERR, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n",
+			npu2_fir_action0, npu2_fir_action1);
+
+		/* Can't do a fence yet, we are just logging fir information for now */
+		npu2_fir_addr += NPU2_FIR_OFFSET;
+		npu2_fir_mask_addr += NPU2_FIR_OFFSET;
+		npu2_fir_action0_addr += NPU2_FIR_OFFSET;
+		npu2_fir_action1_addr += NPU2_FIR_OFFSET;
+
+	}
+
+	/* Set up the HMI event */
+	hmi_evt->severity = OpalHMI_SEV_WARNING;
+	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
+
+	/* Marking the event as recoverable so that we don't crash */
+	queue_hmi_event(hmi_evt, 1);
+	*event_generated = true;
+}
+
 static void find_npu_checkstop_reason(int flat_chip_id,
 				      struct OpalHMIEvent *hmi_evt,
 				      bool *event_generated)
@@ -532,7 +614,7 @@  static void find_npu_checkstop_reason(int flat_chip_id,
 
 	/* Only check for NPU errors if the chip has a NPU */
 	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
-		return;
+		return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, event_generated);
 
 	/* Find the NPU on the chip associated with the HMI. */
 	for_each_phb(phb) {
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index fd1367b7..b7522f4a 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -466,4 +466,22 @@  void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
 #define NPU2_DD1_MISC_SCOM_IND_SCOM_DATA	0x38f
 #define NPU2_MISC_SCOM_IND_SCOM_DATA		0x68f
 
+#define NPU2_FIR_REGISTER_0			0x0000000005013C00
+#define NPU2_FIR_REGISTER_0_MASK		0x0000000005013C03
+#define NPU2_FIR_REGISTER_0_ACTION0		0x0000000005013C06
+#define NPU2_FIR_REGISTER_0_ACTION1		0x0000000005013C07
+
+#define NPU2_FIR_REGISTER_1			0x0000000005013C40
+#define NPU2_FIR_REGISTER_1_MASK		0x0000000005013C43
+#define NPU2_FIR_REGISTER_1_ACTION0		0x0000000005013C46
+#define NPU2_FIR_REGISTER_1_ACTION1		0x0000000005013C47
+
+#define NPU2_FIR_REGISTER_2			0x0000000005013C80
+#define NPU2_FIR_REGISTER_2_MASK		0x0000000005013C83
+#define NPU2_FIR_REGISTER_2_ACTION0		0x0000000005013C86
+#define NPU2_FIR_REGISTER_2_ACTION1		0x0000000005013C87
+
+#define NPU2_TOTAL_FIR_REGISTERS		3
+#define NPU2_FIR_OFFSET				0x40
+
 #endif /* __NPU2_REGS_H */
diff --git a/include/opal-api.h b/include/opal-api.h
index 0bc036ed..cef88f68 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -725,6 +725,104 @@  enum OpalHMI_NestAccelXstopReason {
 	NX_CHECKSTOP_PBI_ISN_UE			= 0x00002000,
 };
 
+/*
+ * Can't use enums for 64 bit values, use #defines
+ */
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_CE		PPC_BIT(0)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_HDR_CE		PPC_BIT(1)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_UE		PPC_BIT(2)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_FLIT_PERR		PPC_BIT(3)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_DATA_PERR		PPC_BIT(4)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_MALFOR		PPC_BIT(5)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_UNSUPPORTED	PPC_BIT(6)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_CONFIG_ERR		PPC_BIT(7)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_CRC_ERR		PPC_BIT(8)
+#define NPU2_CHECKSTOP_REG0_NTL_PRI_ERR			PPC_BIT(9)
+#define NPU2_CHECKSTOP_REG0_NTL_LOGIC_ERR		PPC_BIT(10)
+#define NPU2_CHECKSTOP_REG0_NTL_LMD_POISON		PPC_BIT(11)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_SUE		PPC_BIT(12)
+#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_CE		PPC_BIT(13)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_RECOV_ERR		PPC_BIT(14)
+#define NPU2_CHECKSTOP_REG0_CTL_REG_RING_ERR		PPC_BIT(15)
+#define NPU2_CHECKSTOP_REG0_CTL_MMIO_ST_DATA_UE		PPC_BIT(16)
+#define NPU2_CHECKSTOP_REG0_CTL_PEF			PPC_BIT(17)
+#define NPU2_CHECKSTOP_REG0_CTL_NVL_CFG_ERR		PPC_BIT(18)
+#define NPU2_CHECKSTOP_REG0_CTL_NVL_FATAL_ERR		PPC_BIT(19)
+#define NPU2_CHECKSTOP_REG0_RESERVED_1			PPC_BIT(20)
+#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_UE		PPC_BIT(21)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_PERR		PPC_BIT(22)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_FATAL_ERR		PPC_BIT(23)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_CONFIG_ERR		PPC_BIT(24)
+#define NPU2_CHECKSTOP_REG0_CTL_FWD_PROGRESS_ERR	PPC_BIT(25)
+#define NPU2_CHECKSTOP_REG0_CTL_LOGIC_ERR		PPC_BIT(26)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_UE		PPC_BIT(29)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_CE		PPC_BIT(30)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_PERR		PPC_BIT(31)
+#define NPU2_CHECKSTOP_REG0_DAT_CREG_PERR		PPC_BIT(32)
+#define NPU2_CHECKSTOP_REG0_DAT_RTAG_PERR		PPC_BIT(33)
+#define NPU2_CHECKSTOP_REG0_DAT_STATE_PERR		PPC_BIT(34)
+#define NPU2_CHECKSTOP_REG0_DAT_LOGIC_ERR		PPC_BIT(35)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_SUE		PPC_BIT(36)
+#define NPU2_CHECKSTOP_REG0_DAT_PBRX_SUE		PPC_BIT(37)
+#define NPU2_CHECKSTOP_REG0_XTS_INT			PPC_BIT(40)
+#define NPU2_CHECKSTOP_REG0_XTS_SRAM_CE			PPC_BIT(41)
+#define NPU2_CHECKSTOP_REG0_XTS_SRAM_UE			PPC_BIT(42)
+#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_CE		PPC_BIT(43)
+#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_UE		PPC_BIT(44)
+#define NPU2_CHECKSTOP_REG0_XTS_PBUS_PROTOCOL		PPC_BIT(45)
+
+#define NPU2_CHECKSTOP_REG1_NDL_BRK0_STALL			PPC_BIT(0)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK0_NOSTALL			PPC_BIT(1)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK1_STALL			PPC_BIT(2)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK1_NOSTALL			PPC_BIT(3)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK2_STALL			PPC_BIT(4)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK2_NOSTALL			PPC_BIT(5)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK3_STALL			PPC_BIT(6)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK3_NOSTALL			PPC_BIT(7)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK4_STALL			PPC_BIT(8)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK4_NOSTALL			PPC_BIT(9)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK5_STALL			PPC_BIT(10)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK5_NOSTALL			PPC_BIT(11)
+#define NPU2_CHECKSTOP_REG1_MISC_REG_RING_ERR			PPC_BIT(12)
+#define NPU2_CHECKSTOP_REG1_MISC_INT_RA_PERR			PPC_BIT(13)
+#define NPU2_CHECKSTOP_REG1_MISC_DA_ADDR_PERR			PPC_BIT(14)
+#define NPU2_CHECKSTOP_REG1_MISC_CTRL_PERR			PPC_BIT(15)
+#define NPU2_CHECKSTOP_REG1_MISC_NMMU_ERR			PPC_BIT(16)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_ENTRY_INVALID		PPC_BIT(17)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_ADDR_RANGE_ERR		PPC_BIT(18)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_CA_ERR		PPC_BIT(19)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_CACHE_MULT_HIT_ERR		PPC_BIT(20)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_TW_ERR		PPC_BIT(21)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_REQ_TO_ERR			PPC_BIT(22)
+#define NPU2_CHECKSTOP_REG1_ATS_TCD_PERR			PPC_BIT(23)
+#define NPU2_CHECKSTOP_REG1_ATS_TDR_PERR			PPC_BIT(24)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_UE			PPC_BIT(25)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_CE			PPC_BIT(26)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_UE			PPC_BIT(27)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_CE			PPC_BIT(28)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_UE			PPC_BIT(29)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_CE			PPC_BIT(30)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_PERR			PPC_BIT(31)
+#define NPU2_CHECKSTOP_REG1_ATS_IODA_ADDR_PERR			PPC_BIT(32)
+#define NPU2_CHECKSTOP_REG1_ATS_NPU_CTRL_PERR			PPC_BIT(33)
+#define NPU2_CHECKSTOP_REG1_ATS_NPU_TOR_PERR			PPC_BIT(34)
+#define NPU2_CHECKSTOP_REG1_ATS_INVAL_IODA_TBL_SEL		PPC_BIT(35)
+
+#define NPU2_CHECKSTOP_REG2_XSL_MMIO_INVALIDATE_REQ_WHILE_1_INPROG	PPC_BIT(36)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_0			PPC_BIT(37)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_1			PPC_BIT(38)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_RD_PEE_COMPLETION		PPC_BIT(39)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_CO_RESP			PPC_BIT(40)
+#define NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID		PPC_BIT(41)
+#define NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE				PPC_BIT(42)
+#define NPU2_CHECKSTOP_REG2_XSL_BLOOM_FILTER_PROTECT_ERR		PPC_BIT(43)
+#define NPU2_CHECKSTOP_REG2_XSL_CE					PPC_BIT(46)
+#define NPU2_CHECKSTOP_REG2_XSL_UE					PPC_BIT(47)
+#define NPU2_CHECKSTOP_REG2_XSL_SLBI_TLBI_BUFF_OVERFLOW			PPC_BIT(48)
+#define NPU2_CHECKSTOP_REG2_XSL_SBE_CORR_ERR_PB_CHKOUT_RSP_DATA		PPC_BIT(49)
+#define NPU2_CHECKSTOP_REG2_XSL_UE_PB_CHKOUT_RSP_DATA			PPC_BIT(50)
+#define NPU2_CHECKSTOP_REG2_XSL_SUE_PB_CHKOUT_RSP_DATA			PPC_BIT(51)
+
 struct OpalHMIEvent {
 	uint8_t		version;	/* 0x00 */
 	uint8_t		severity;	/* 0x01 */