[1/2] hw/npu2: Implement logging HMI actions

Message ID 20171130125051.1433-1-bsingharora@gmail.com
State New
Headers show
Series
  • [1/2] hw/npu2: Implement logging HMI actions
Related show

Commit Message

Balbir Singh Nov. 30, 2017, 12:50 p.m.
Log HMI errors as step 1. OS will need to deduce
and interpret the HMI event.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 core/hmi.c          | 83 ++++++++++++++++++++++++++++++++++++++++++++-
 include/npu2-regs.h | 11 ++++++
 include/opal-api.h  | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)

Patch

diff --git a/core/hmi.c b/core/hmi.c
index 07c08462..bb144512 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -23,6 +23,7 @@ 
 #include <cpu.h>
 #include <chip.h>
 #include <npu-regs.h>
+#include <npu2-regs.h>
 #include <npu.h>
 #include <capp.h>
 
@@ -517,6 +518,86 @@  static void find_nx_checkstop_reason(int flat_chip_id,
 	*event_generated = true;
 }
 
+static void find_npu2_checkstop_reason(int flat_chip_id,
+				      struct OpalHMIEvent *hmi_evt,
+				      bool *event_generated)
+{
+	struct phb *phb;
+	struct npu *p = NULL;
+	int i;
+
+	uint64_t npu2_fir;
+	uint64_t npu2_fir_mask;
+	uint64_t npu2_fir_action0;
+	uint64_t npu2_fir_action1;
+	uint64_t npu2_fir_addr;
+	uint64_t npu2_fir_mask_addr;
+	uint64_t npu2_fir_action0_addr;
+	uint64_t npu2_fir_action1_addr;
+	uint64_t fatal_errors;
+	int total_errors = 0;
+
+	/* Find the NPU on the chip associated with the HMI. */
+	for_each_phb(phb) {
+		/* NOTE: if a chip ever has >1 NPU this will need adjusting */
+		if (dt_node_is_compatible(phb->dt_node, "ibm,power9-npu-pciex") &&
+		    (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
+			p = phb_to_npu(phb);
+			break;
+		}
+	}
+
+	/* If we didn't find a NPU on the chip, it's not our checkstop. */
+	if (p == NULL)
+		return;
+
+	npu2_fir_addr = NPU2_FIR_REGISTER_0;
+	npu2_fir_mask_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_MASK_OFFSET;
+	npu2_fir_action0_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION0_OFFSET;
+	npu2_fir_action1_addr = NPU2_FIR_REGISTER_0 + NPU2_FIR_ACTION1_OFFSET;
+
+	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+		/* Read all the registers necessary to find a checkstop condition. */
+		if (xscom_read(flat_chip_id, npu2_fir_addr, &npu2_fir) ||
+			xscom_read(flat_chip_id, npu2_fir_mask_addr, &npu2_fir_mask) ||
+			xscom_read(flat_chip_id, npu2_fir_action0_addr, &npu2_fir_action0) ||
+			xscom_read(flat_chip_id, npu2_fir_action1_addr, &npu2_fir_action1)) {
+			prerror("HMI: Couldn't read NPU FIR register%d with XSCOM\n", i);
+			continue;
+		}
+
+		fatal_errors = npu2_fir & ~npu2_fir_mask & npu2_fir_action0 & npu2_fir_action1;
+
+		if (fatal_errors) {
+			prlog(PR_ERR, "NPU: FIR#%d FIR 0x%016llx mask 0x%016llx\n",
+					i, npu2_fir, npu2_fir_mask);
+			prlog(PR_ERR, "NPU: ACTION0 0x%016llx, ACTION1 0x%016llx\n",
+					npu2_fir_action0, npu2_fir_action1);
+			total_errors++;
+		}
+
+		/* Can't do a fence yet, we are just logging fir information for now */
+		npu2_fir_addr += NPU2_FIR_OFFSET;
+		npu2_fir_mask_addr += NPU2_FIR_OFFSET;
+		npu2_fir_action0_addr += NPU2_FIR_OFFSET;
+		npu2_fir_action1_addr += NPU2_FIR_OFFSET;
+
+	}
+
+	if (!total_errors)
+		return;
+
+	/* Set up the HMI event */
+	hmi_evt->severity = OpalHMI_SEV_WARNING;
+	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
+	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
+
+	/* Marking the event as recoverable so that we don't crash */
+	queue_hmi_event(hmi_evt, 1);
+	*event_generated = true;
+}
+
 static void find_npu_checkstop_reason(int flat_chip_id,
 				      struct OpalHMIEvent *hmi_evt,
 				      bool *event_generated)
@@ -532,7 +613,7 @@  static void find_npu_checkstop_reason(int flat_chip_id,
 
 	/* Only check for NPU errors if the chip has a NPU */
 	if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
-		return;
+		return find_npu2_checkstop_reason(flat_chip_id, hmi_evt, event_generated);
 
 	/* Find the NPU on the chip associated with the HMI. */
 	for_each_phb(phb) {
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index fdaad192..4a5cfb8f 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -475,4 +475,15 @@  void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
 #define NPU2_DD1_MISC_SCOM_IND_SCOM_DATA	0x38f
 #define NPU2_MISC_SCOM_IND_SCOM_DATA		0x68f
 
+#define NPU2_FIR_OFFSET				0x40
+#define NPU2_FIR_MASK_OFFSET			0x3
+#define NPU2_FIR_ACTION0_OFFSET			0x6
+#define NPU2_FIR_ACTION1_OFFSET			0x7
+
+#define NPU2_FIR_REGISTER_0			0x0000000005013C00
+#define NPU2_FIR_REGISTER_1			0x0000000005013C40
+#define NPU2_FIR_REGISTER_2			0x0000000005013C80
+
+#define NPU2_TOTAL_FIR_REGISTERS		3
+
 #endif /* __NPU2_REGS_H */
diff --git a/include/opal-api.h b/include/opal-api.h
index e0580891..86a6103d 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -725,6 +725,104 @@  enum OpalHMI_NestAccelXstopReason {
 	NX_CHECKSTOP_PBI_ISN_UE			= 0x00002000,
 };
 
+/*
+ * Can't use enums for 64 bit values, use #defines
+ */
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_CE		PPC_BIT(0)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_HDR_CE		PPC_BIT(1)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_UE		PPC_BIT(2)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_FLIT_PERR		PPC_BIT(3)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_DATA_PERR		PPC_BIT(4)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_MALFOR		PPC_BIT(5)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_PKT_UNSUPPORTED	PPC_BIT(6)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_CONFIG_ERR		PPC_BIT(7)
+#define NPU2_CHECKSTOP_REG0_NTL_NVL_CRC_ERR		PPC_BIT(8)
+#define NPU2_CHECKSTOP_REG0_NTL_PRI_ERR			PPC_BIT(9)
+#define NPU2_CHECKSTOP_REG0_NTL_LOGIC_ERR		PPC_BIT(10)
+#define NPU2_CHECKSTOP_REG0_NTL_LMD_POISON		PPC_BIT(11)
+#define NPU2_CHECKSTOP_REG0_NTL_ARRAY_DATA_SUE		PPC_BIT(12)
+#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_CE		PPC_BIT(13)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_RECOV_ERR		PPC_BIT(14)
+#define NPU2_CHECKSTOP_REG0_CTL_REG_RING_ERR		PPC_BIT(15)
+#define NPU2_CHECKSTOP_REG0_CTL_MMIO_ST_DATA_UE		PPC_BIT(16)
+#define NPU2_CHECKSTOP_REG0_CTL_PEF			PPC_BIT(17)
+#define NPU2_CHECKSTOP_REG0_CTL_NVL_CFG_ERR		PPC_BIT(18)
+#define NPU2_CHECKSTOP_REG0_CTL_NVL_FATAL_ERR		PPC_BIT(19)
+#define NPU2_CHECKSTOP_REG0_RESERVED_1			PPC_BIT(20)
+#define NPU2_CHECKSTOP_REG0_CTL_ARRAY_UE		PPC_BIT(21)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_PERR		PPC_BIT(22)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_FATAL_ERR		PPC_BIT(23)
+#define NPU2_CHECKSTOP_REG0_CTL_PBUS_CONFIG_ERR		PPC_BIT(24)
+#define NPU2_CHECKSTOP_REG0_CTL_FWD_PROGRESS_ERR	PPC_BIT(25)
+#define NPU2_CHECKSTOP_REG0_CTL_LOGIC_ERR		PPC_BIT(26)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_UE		PPC_BIT(29)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_CE		PPC_BIT(30)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_PERR		PPC_BIT(31)
+#define NPU2_CHECKSTOP_REG0_DAT_CREG_PERR		PPC_BIT(32)
+#define NPU2_CHECKSTOP_REG0_DAT_RTAG_PERR		PPC_BIT(33)
+#define NPU2_CHECKSTOP_REG0_DAT_STATE_PERR		PPC_BIT(34)
+#define NPU2_CHECKSTOP_REG0_DAT_LOGIC_ERR		PPC_BIT(35)
+#define NPU2_CHECKSTOP_REG0_DAT_DATA_BE_SUE		PPC_BIT(36)
+#define NPU2_CHECKSTOP_REG0_DAT_PBRX_SUE		PPC_BIT(37)
+#define NPU2_CHECKSTOP_REG0_XTS_INT			PPC_BIT(40)
+#define NPU2_CHECKSTOP_REG0_XTS_SRAM_CE			PPC_BIT(41)
+#define NPU2_CHECKSTOP_REG0_XTS_SRAM_UE			PPC_BIT(42)
+#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_CE		PPC_BIT(43)
+#define NPU2_CHECKSTOP_REG0_XTS_PROTOCOL_UE		PPC_BIT(44)
+#define NPU2_CHECKSTOP_REG0_XTS_PBUS_PROTOCOL		PPC_BIT(45)
+
+#define NPU2_CHECKSTOP_REG1_NDL_BRK0_STALL			PPC_BIT(0)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK0_NOSTALL			PPC_BIT(1)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK1_STALL			PPC_BIT(2)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK1_NOSTALL			PPC_BIT(3)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK2_STALL			PPC_BIT(4)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK2_NOSTALL			PPC_BIT(5)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK3_STALL			PPC_BIT(6)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK3_NOSTALL			PPC_BIT(7)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK4_STALL			PPC_BIT(8)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK4_NOSTALL			PPC_BIT(9)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK5_STALL			PPC_BIT(10)
+#define NPU2_CHECKSTOP_REG1_NDL_BRK5_NOSTALL			PPC_BIT(11)
+#define NPU2_CHECKSTOP_REG1_MISC_REG_RING_ERR			PPC_BIT(12)
+#define NPU2_CHECKSTOP_REG1_MISC_INT_RA_PERR			PPC_BIT(13)
+#define NPU2_CHECKSTOP_REG1_MISC_DA_ADDR_PERR			PPC_BIT(14)
+#define NPU2_CHECKSTOP_REG1_MISC_CTRL_PERR			PPC_BIT(15)
+#define NPU2_CHECKSTOP_REG1_MISC_NMMU_ERR			PPC_BIT(16)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_ENTRY_INVALID		PPC_BIT(17)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_ADDR_RANGE_ERR		PPC_BIT(18)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_CA_ERR		PPC_BIT(19)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_CACHE_MULT_HIT_ERR		PPC_BIT(20)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_PAGE_ACCESS_TW_ERR		PPC_BIT(21)
+#define NPU2_CHECKSTOP_REG1_ATS_TCE_REQ_TO_ERR			PPC_BIT(22)
+#define NPU2_CHECKSTOP_REG1_ATS_TCD_PERR			PPC_BIT(23)
+#define NPU2_CHECKSTOP_REG1_ATS_TDR_PERR			PPC_BIT(24)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_UE			PPC_BIT(25)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_EA_CE			PPC_BIT(26)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_UE			PPC_BIT(27)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_TDRMEM_CE			PPC_BIT(28)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_UE			PPC_BIT(29)
+#define NPU2_CHECKSTOP_REG1_ATS_AT_RSPOUT_CE			PPC_BIT(30)
+#define NPU2_CHECKSTOP_REG1_ATS_TVT_PERR			PPC_BIT(31)
+#define NPU2_CHECKSTOP_REG1_ATS_IODA_ADDR_PERR			PPC_BIT(32)
+#define NPU2_CHECKSTOP_REG1_ATS_NPU_CTRL_PERR			PPC_BIT(33)
+#define NPU2_CHECKSTOP_REG1_ATS_NPU_TOR_PERR			PPC_BIT(34)
+#define NPU2_CHECKSTOP_REG1_ATS_INVAL_IODA_TBL_SEL		PPC_BIT(35)
+
+#define NPU2_CHECKSTOP_REG2_XSL_MMIO_INVALIDATE_REQ_WHILE_1_INPROG	PPC_BIT(36)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_0			PPC_BIT(37)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_ITAG_PORT_1			PPC_BIT(38)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_RD_PEE_COMPLETION		PPC_BIT(39)
+#define NPU2_CHECKSTOP_REG2_XSL_UNEXPECTED_CO_RESP			PPC_BIT(40)
+#define NPU2_CHECKSTOP_REG2_XSL_XLAT_REQ_WHILE_SPAP_INVALID		PPC_BIT(41)
+#define NPU2_CHECKSTOP_REG2_XSL_INVALID_PEE				PPC_BIT(42)
+#define NPU2_CHECKSTOP_REG2_XSL_BLOOM_FILTER_PROTECT_ERR		PPC_BIT(43)
+#define NPU2_CHECKSTOP_REG2_XSL_CE					PPC_BIT(46)
+#define NPU2_CHECKSTOP_REG2_XSL_UE					PPC_BIT(47)
+#define NPU2_CHECKSTOP_REG2_XSL_SLBI_TLBI_BUFF_OVERFLOW			PPC_BIT(48)
+#define NPU2_CHECKSTOP_REG2_XSL_SBE_CORR_ERR_PB_CHKOUT_RSP_DATA		PPC_BIT(49)
+#define NPU2_CHECKSTOP_REG2_XSL_UE_PB_CHKOUT_RSP_DATA			PPC_BIT(50)
+#define NPU2_CHECKSTOP_REG2_XSL_SUE_PB_CHKOUT_RSP_DATA			PPC_BIT(51)
+
 struct OpalHMIEvent {
 	uint8_t		version;	/* 0x00 */
 	uint8_t		severity;	/* 0x01 */