[2/2] npu2.c: Add PE error detection

Message ID 20180111042851.16548-3-alistair@popple.id.au
State Accepted
Headers show
Series
  • Enable reporting of frozen NVLink bricks
Related show

Commit Message

Alistair Popple Jan. 11, 2018, 4:28 a.m.
Invalid accesses from the GPU can cause a specific PE to be frozen by the
NPU. Add an interrupt handler which reports the frozen PE to the operating
system as an EEH event.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
---
 hw/npu2.c           | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/npu2-regs.h | 17 +----------------
 2 files changed, 54 insertions(+), 18 deletions(-)

Patch

diff --git a/hw/npu2.c b/hw/npu2.c
index 6658ab50..c88394b6 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -1213,6 +1213,35 @@  static int64_t npu2_freeze_status(struct phb *phb __unused,
 	return OPAL_SUCCESS;
 }
 
+static int64_t npu2_eeh_next_error(struct phb *phb,
+				   uint64_t *first_frozen_pe,
+				   uint16_t *pci_error_type,
+				   uint16_t *severity)
+{
+	struct npu2 *p = phb_to_npu2(phb);
+	int i;
+	uint64_t result = 0;
+
+	if (!first_frozen_pe || !pci_error_type || !severity)
+		return OPAL_PARAMETER;
+
+	*first_frozen_pe = -1;
+	*pci_error_type = OPAL_EEH_NO_ERROR;
+	*severity = OPAL_EEH_SEV_NO_ERROR;
+
+	for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
+		result = npu2_read(p, NPU2_MISC_PESTB(i));
+		if (result > 0) {
+			*first_frozen_pe = i;
+			*pci_error_type = OPAL_EEH_PE_ERROR;
+			*severity = OPAL_EEH_SEV_PE_ER;
+			break;
+		}
+	}
+
+	return OPAL_SUCCESS;
+}
+
 static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
 			     uint64_t pe_number, uint32_t tce_size,
 			     uint64_t dma_addr, uint32_t npages)
@@ -1281,7 +1310,7 @@  static const struct phb_ops npu_ops = {
 	.eeh_freeze_status	= npu2_freeze_status,
 	.eeh_freeze_clear	= NULL,
 	.eeh_freeze_set		= NULL,
-	.next_error		= NULL,
+	.next_error		= npu2_eeh_next_error,
 	.err_inject		= NULL,
 	.get_diag_data		= NULL,
 	.get_diag_data2		= NULL,
@@ -1814,7 +1843,14 @@  static void npu2_add_phb_properties(struct npu2 *p)
 
 static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
 {
-	return IRQ_ATTR_TARGET_LINUX;
+	struct npu2 *p = is->data;
+	uint32_t idx = isn - p->base_lsi;
+
+	if (idx == 18)
+		/* TCE Interrupt - used to detect a frozen PE */
+		return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE;
+	else
+		return IRQ_ATTR_TARGET_LINUX;
 }
 
 static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
@@ -1852,7 +1888,22 @@  static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
 	return strdup(name);
 }
 
+static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+	struct npu2 *p = is->data;
+	uint32_t idx = isn - p->base_lsi;
+
+	if (idx != 18) {
+		prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
+		return;
+	}
+
+	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+				OPAL_EVENT_PCI_ERROR);
+}
+
 static const struct irq_source_ops npu2_ipi_ops = {
+	.interrupt	= npu2_err_interrupt,
 	.attributes	= npu2_ipi_attributes,
 	.name = npu2_ipi_name,
 };
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index fdaad192..e739ac50 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -342,22 +342,7 @@  void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
 #define   NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE	PPC_BIT(0)
 #define   NPU2_MISC_BRICK_BDF2PE_MAP_PE		PPC_BITMASK(4,7)
 #define   NPU2_MISC_BRICK_BDF2PE_MAP_BDF	PPC_BITMASK(8,23)
-#define NPU2_MISC_PESTB00			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x200)
-#define NPU2_MISC_PESTB01			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x208)
-#define NPU2_MISC_PESTB02			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x210)
-#define NPU2_MISC_PESTB03			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x218)
-#define NPU2_MISC_PESTB04			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x220)
-#define NPU2_MISC_PESTB05			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x228)
-#define NPU2_MISC_PESTB06			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x230)
-#define NPU2_MISC_PESTB07			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x238)
-#define NPU2_MISC_PESTB08			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x240)
-#define NPU2_MISC_PESTB09			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x248)
-#define NPU2_MISC_PESTB10			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x250)
-#define NPU2_MISC_PESTB11			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x258)
-#define NPU2_MISC_PESTB12			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x260)
-#define NPU2_MISC_PESTB13			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x268)
-#define NPU2_MISC_PESTB14			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x270)
-#define NPU2_MISC_PESTB15			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x278)
+#define NPU2_MISC_PESTB(pe)			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x200 + (pe)*8)
 #define NPU2_MISC_IRQ_LOG0			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x300)
 #define NPU2_MISC_IRQ_LOG01			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x308)
 #define NPU2_MISC_IRQ_LOG02			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x310)